Clean the environment an load thr datasets that we are going to use.
rm(list=ls())
# Who dataset: A set of social, economic, health, and political indicators
who = read.csv('WHO.csv')
# Economic freedom index dataset: A dataset with plenty of economic indicators.
economic_freedom = read.csv('index2022_data.csv')
Libraries
if (!require("mice")){
install.packages("mice")
}
## Loading required package: mice
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(mice)
if (!require("kernlab")){
install.packages('kernlab')
}
## Loading required package: kernlab
library(kernlab)
if (!require("countrycode")){
install.packages('countrycode')
}
## Loading required package: countrycode
library(countrycode)
if (!require("rworldmap")){
install.packages('rworldmap')
}
## Loading required package: rworldmap
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
library(rworldmap)
if (!require("factoextra")){
install.packages('factoextra')
}
## Loading required package: factoextra
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:kernlab':
##
## alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(factoextra)
if (!require("igraph")){
install.packages('igraph')
}
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(igraph)
if (!require("factoextra")){
install.packages('factoextra')
}
library(factoextra)
if (!require("cluster")){
install.packages('cluster')
}
## Loading required package: cluster
library(cluster)
if (!require("mclust")){
install.packages('mclust')
}
## Loading required package: mclust
## Package 'mclust' version 5.4.10
## Type 'citation("mclust")' for citing this R package in publications.
library(mclust)
if (!require("GGally")){
install.packages('GGally')
}
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(GGally)
if (!require("tidyverse")){
install.packages('tidyverse')
}
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::alpha() masks kernlab::alpha()
## ✖ dplyr::as_data_frame() masks tibble::as_data_frame(), igraph::as_data_frame()
## ✖ purrr::compose() masks igraph::compose()
## ✖ purrr::cross() masks mclust::cross(), kernlab::cross()
## ✖ tidyr::crossing() masks igraph::crossing()
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::groups() masks igraph::groups()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks mclust::map()
## ✖ purrr::simplify() masks igraph::simplify()
library(tidyverse)
if (!require("VIM")){
install.packages('VIM')
}
## Loading required package: VIM
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
##
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:mclust':
##
## diabetes
##
## The following object is masked from 'package:datasets':
##
## sleep
library(VIM)
if (!require("Quandl")){
install.packages('Quandl')
}
## Loading required package: Quandl
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(Quandl)
if (!require("lubridate")){
install.packages('lubridate')
}
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:igraph':
##
## %--%, union
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(lubridate)
if (!require("quantmod")){
install.packages('quantmod')
}
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(quantmod)
if (!require("ggpubr")){
install.packages('ggpubr')
}
## Loading required package: ggpubr
library("ggpubr")
if (!require("outliers")){
install.packages('outliers')
}
## Loading required package: outliers
library(outliers)
if (!require("ggplot2")){
install.packages("ggplot2")
}
library(ggplot2)
if (!require("Amelia")){
install.packages("Amelia")
}
## Loading required package: Amelia
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library("Amelia")
Let’s have a look at the datasets
head(who)
head(economic_freedom)
summary(who)
## Country CountryID Continent
## Length:202 Min. : 1.00 Min. :1.000
## Class :character 1st Qu.: 51.25 1st Qu.:2.000
## Mode :character Median :101.50 Median :3.000
## Mean :101.50 Mean :3.579
## 3rd Qu.:151.75 3rd Qu.:5.000
## Max. :202.00 Max. :7.000
##
## Adolescent.fertility.rate.... Adult.literacy.rate....
## Min. : 0.00 Min. :23.60
## 1st Qu.: 19.00 1st Qu.:68.40
## Median : 46.00 Median :86.50
## Mean : 59.46 Mean :78.87
## 3rd Qu.: 91.00 3rd Qu.:95.30
## Max. :199.00 Max. :99.80
## NA's :25 NA's :71
## Gross.national.income.per.capita..PPP.international...
## Min. : 260
## 1st Qu.: 2112
## Median : 6175
## Mean :11250
## 3rd Qu.:14502
## Max. :60870
## NA's :24
## Net.primary.school.enrolment.ratio.female....
## Min. : 6.00
## 1st Qu.: 79.00
## Median : 90.00
## Mean : 84.03
## 3rd Qu.: 96.00
## Max. :100.00
## NA's :23
## Net.primary.school.enrolment.ratio.male.... Population..in.thousands..total
## Min. : 11.0 Min. : 2
## 1st Qu.: 79.5 1st Qu.: 1340
## Median : 90.0 Median : 6762
## Mean : 85.7 Mean : 34098
## 3rd Qu.: 96.0 3rd Qu.: 21732
## Max. :100.0 Max. :1328474
## NA's :23 NA's :9
## Population.annual.growth.rate.... Population.in.urban.areas....
## Min. :-2.500 Min. : 10.00
## 1st Qu.: 0.500 1st Qu.: 36.00
## Median : 1.300 Median : 57.00
## Mean : 1.298 Mean : 54.91
## 3rd Qu.: 2.100 3rd Qu.: 73.00
## Max. : 4.300 Max. :100.00
## NA's :9 NA's :9
## Population.living.below.the.poverty.line....living.on..lt..US.1.per.day.
## Min. : 2.00
## 1st Qu.: 2.00
## Median : 7.45
## Mean :16.02
## 3rd Qu.:23.05
## Max. :70.80
## NA's :130
## Population.median.age..years. Population.proportion.over.60....
## Min. :15.00 Min. : 2.0
## 1st Qu.:20.00 1st Qu.: 5.0
## Median :25.00 Median : 8.0
## Mean :26.74 Mean :10.3
## 3rd Qu.:35.00 3rd Qu.:15.0
## Max. :43.00 Max. :27.0
## NA's :23 NA's :9
## Population.proportion.under.15.... Registration.coverage.of.births....
## Min. :14.00 Min. : 3.00
## 1st Qu.:20.00 1st Qu.:72.00
## Median :31.00 Median :90.00
## Mean :30.11 Mean :77.26
## 3rd Qu.:39.00 3rd Qu.:90.00
## Max. :49.00 Max. :90.00
## NA's :9 NA's :39
## Total.fertility.rate..per.woman.
## Min. :1.200
## 1st Qu.:1.800
## Median :2.500
## Mean :3.005
## 3rd Qu.:3.900
## Max. :7.300
## NA's :10
## Antenatal.care.coverage...at.least.four.visits....
## Min. : 7
## 1st Qu.:41
## Median :61
## Mean :58
## 3rd Qu.:76
## Max. :99
## NA's :117
## Antiretroviral.therapy.coverage.among.HIV.infected.pregt.women.for.PMTCT....
## Min. : 1.00
## 1st Qu.: 7.75
## Median :14.00
## Mean :22.53
## 3rd Qu.:27.50
## Max. :95.00
## NA's :166
## Antiretroviral.therapy.coverage.among.people.with.advanced.HIV.infections....
## Min. : 1.00
## 1st Qu.:12.00
## Median :22.00
## Mean :28.36
## 3rd Qu.:38.50
## Max. :95.00
## NA's :99
## Births.attended.by.skilled.health.personnel....
## Min. : 6.00
## 1st Qu.: 60.00
## Median : 95.00
## Mean : 79.52
## 3rd Qu.:100.00
## Max. :100.00
## NA's :21
## Births.by.caesarean.section....
## Min. : 0.00
## 1st Qu.: 3.00
## Median : 9.00
## Mean :10.22
## 3rd Qu.:16.00
## Max. :31.00
## NA's :125
## Children.aged.6.59.months.who.received.vitamin.A.supplementation....
## Min. :11.10
## 1st Qu.:33.70
## Median :49.80
## Mean :51.22
## 3rd Qu.:68.20
## Max. :84.10
## NA's :177
## Children.aged..lt.5.years.sleeping.under.insecticide.treated.nets....
## Min. : 0.100
## 1st Qu.: 1.325
## Median : 5.600
## Mean : 9.891
## 3rd Qu.:13.075
## Max. :49.000
## NA's :156
## Children.aged..lt.5.years.who.received.any.antimalarial.treatment.for.fever....
## Min. : 0.20
## 1st Qu.: 8.10
## Median :30.95
## Mean :29.63
## 3rd Qu.:48.45
## Max. :62.70
## NA's :152
## Children.aged..lt.5.years.with.ARI.symptoms.taken.to.facility....
## Min. : 6.50
## 1st Qu.:33.92
## Median :44.70
## Mean :44.48
## 3rd Qu.:55.98
## Max. :76.40
## NA's :166
## Children.aged..lt.5.years.with.diarrhoea.receiving.ORT....
## Min. :31.90
## 1st Qu.:53.50
## Median :58.90
## Mean :58.59
## 3rd Qu.:66.80
## Max. :83.40
## NA's :165
## Contraceptive.prevalence....
## Min. : 2.80
## 1st Qu.:25.70
## Median :44.35
## Mean :44.45
## 3rd Qu.:65.75
## Max. :90.20
## NA's :94
## Neonates.protected.at.birth.against.neonatal.tetanus..PAB.....
## Min. : 5.00
## 1st Qu.:72.00
## Median :83.00
## Mean :78.93
## 3rd Qu.:89.00
## Max. :96.00
## NA's :98
## One.year.olds.immunized.with.MCV
## Min. :23.00
## 1st Qu.:80.00
## Median :92.00
## Mean :86.81
## 3rd Qu.:97.00
## Max. :99.00
## NA's :9
## One.year.olds.immunized.with.three.doses.of.diphtheria.tetanus.toxoid.and.pertussis..DTP3.....
## Min. :20.00
## 1st Qu.:83.00
## Median :94.00
## Mean :87.61
## 3rd Qu.:97.00
## Max. :99.00
## NA's :9
## One.year.olds.immunized.with.three.doses.of.Hepatitis.B..HepB3.....
## Min. : 4.00
## 1st Qu.:82.50
## Median :92.00
## Mean :86.37
## 3rd Qu.:97.00
## Max. :99.00
## NA's :31
## One.year.olds.immunized.with.three.doses.of.Hib..Hib3..vaccine....
## Min. :11.00
## 1st Qu.:84.25
## Median :94.00
## Mean :88.21
## 3rd Qu.:97.00
## Max. :99.00
## NA's :88
## Tuberculosis.detection.rate.under.DOTS....
## Min. : 0.00
## 1st Qu.: 42.00
## Median : 62.00
## Mean : 62.57
## 3rd Qu.: 80.00
## Max. :284.00
## NA's :15
## Tuberculosis.treatment.success.under.DOTS....
## Min. : 0.00
## 1st Qu.: 71.00
## Median : 80.00
## Mean : 77.59
## 3rd Qu.: 87.00
## Max. :100.00
## NA's :25
## Women.who.have.had.mammography.... Women.who.have.had.PAP.smear....
## Min. : 0 Min. : 0.00
## 1st Qu.: 2 1st Qu.: 6.75
## Median :16 Median :40.00
## Mean :29 Mean :38.04
## 3rd Qu.:54 3rd Qu.:67.25
## Max. :98 Max. :83.00
## NA's :129 NA's :130
## Community.and.traditional.health.workers.density..per.10.000.population.
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 4.000
## Mean : 9.194
## 3rd Qu.:14.000
## Max. :43.000
## NA's :171
## Dentistry.personnel.density..per.10.000.population.
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 5.000
## Mean : 5.211
## 3rd Qu.: 8.000
## Max. :16.000
## NA's :88
## Environment.and.public.health.workers.density..per.10.000.population.
## Min. : 1.00
## 1st Qu.: 1.00
## Median : 2.00
## Mean : 2.81
## 3rd Qu.: 3.00
## Max. :10.00
## NA's :181
## External.resources.for.health.as.percentage.of.total.expenditure.on.health
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.300
## Mean : 9.717
## 3rd Qu.:13.800
## Max. :73.100
## NA's :11
## General.government.expenditure.on.health.as.percentage.of.total.expenditure.on.health
## Min. :12.30
## 1st Qu.:44.80
## Median :62.80
## Mean :60.11
## 3rd Qu.:76.60
## Max. :98.60
## NA's :9
## General.government.expenditure.on.health.as.percentage.of.total.government.expenditure
## Min. : 1.30
## 1st Qu.: 7.40
## Median :10.70
## Mean :11.06
## 3rd Qu.:14.00
## Max. :29.80
## NA's :9
## Hospital.beds..per.10.000.population.
## Min. : 1.00
## 1st Qu.: 12.00
## Median : 26.00
## Mean : 32.17
## 3rd Qu.: 48.25
## Max. :141.00
## NA's :22
## Laboratory.health.workers.density..per.10.000.population.
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 3.000
## Mean : 4.568
## 3rd Qu.: 5.000
## Max. :23.000
## NA's :165
## Number.of.community.and.traditional.health.workers
## Min. : 0
## 1st Qu.: 133
## Median : 968
## Mean : 10636
## 3rd Qu.: 5528
## Max. :115761
## NA's :151
## Number.of.dentistry.personnel Number.of.environment.and.public.health.workers
## Min. : 1.0 Min. : 9
## 1st Qu.: 58.5 1st Qu.: 101
## Median : 850.0 Median : 238
## Mean : 9901.0 Mean : 5142
## 3rd Qu.: 4484.2 3rd Qu.: 1541
## Max. :463663.0 Max. :167080
## NA's :12 NA's :129
## Number.of.laboratory.health.workers Number.of.nursing.and.midwifery.personnel
## Min. : 17 Min. : 22
## 1st Qu.: 195 1st Qu.: 2499
## Median : 690 Median : 12840
## Mean : 15658 Mean : 93414
## 3rd Qu.: 3816 3rd Qu.: 46930
## Max. :651035 Max. :2669603
## NA's :123 NA's :9
## Number.of.other.health.service.providers Number.of.pharmaceutical.personnel
## Min. : 4 Min. : 1
## 1st Qu.: 566 1st Qu.: 108
## Median : 1960 Median : 1002
## Mean : 100960 Mean : 14090
## 3rd Qu.: 19142 3rd Qu.: 5046
## Max. :4138567 Max. :559408
## NA's :107 NA's :36
## Number.of.physicians
## Min. : 4
## 1st Qu.: 345
## Median : 5187
## Mean : 43591
## 3rd Qu.: 28812
## Max. :1862630
## NA's :9
## Nursing.and.midwifery.personnel.density..per.10.000.population.
## Min. : 1.00
## 1st Qu.: 9.00
## Median : 29.00
## Mean : 43.05
## 3rd Qu.: 57.00
## Max. :955.00
## NA's :9
## Other.health.service.providers.density..per.10.000.population.
## Min. : 1.00
## 1st Qu.: 3.00
## Median : 7.50
## Mean : 17.21
## 3rd Qu.: 18.25
## Max. :145.00
## NA's :132
## Out.of.pocket.expenditure.as.percentage.of.private.expenditure.on.health
## Min. : 14.40
## 1st Qu.: 72.50
## Median : 86.70
## Mean : 80.44
## 3rd Qu.: 96.00
## Max. :100.00
## NA's :9
## Per.capita.government.expenditure.on.health..PPP.int....
## Min. : 4
## 1st Qu.: 65
## Median : 219
## Mean : 587
## 3rd Qu.: 565
## Max. :5309
## NA's :9
## Per.capita.government.expenditure.on.health.at.average.exchange.rate..US..
## Min. : 0.0
## 1st Qu.: 19.0
## Median : 108.0
## Mean : 558.6
## 3rd Qu.: 381.0
## Max. :5991.0
## NA's :9
## Per.capita.total.expenditure.on.health..PPP.int....
## Min. : 15.0
## 1st Qu.: 116.0
## Median : 353.0
## Mean : 847.3
## 3rd Qu.: 869.0
## Max. :7154.0
## NA's :9
## Per.capita.total.expenditure.on.health.at.average.exchange.rate..US..
## Min. : 0.0
## 1st Qu.: 42.0
## Median : 217.0
## Mean : 774.8
## 3rd Qu.: 568.0
## Max. :6714.0
## NA's :9
## Pharmaceutical.personnel.density..per.10.000.population.
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 5.000
## Mean : 5.728
## 3rd Qu.: 8.000
## Max. :20.000
## NA's :99
## Physicians.density..per.10.000.population.
## Min. : 1.00
## 1st Qu.: 5.00
## Median : 13.00
## Mean : 19.77
## 3rd Qu.: 27.00
## Max. :474.00
## NA's :32
## Private.expenditure.on.health.as.percentage.of.total.expenditure.on.health
## Min. : 1.40
## 1st Qu.:23.40
## Median :37.20
## Mean :39.89
## 3rd Qu.:55.20
## Max. :87.70
## NA's :9
## Private.prepaid.plans.as.percentage.of.private.expenditure.on.health
## Min. : 0.00
## 1st Qu.: 0.10
## Median : 5.20
## Mean :10.93
## 3rd Qu.:14.20
## Max. :79.50
## NA's :32
## Ratio.of.health.management.and.support.workers.to.health.service.providers
## Min. : 0.000
## 1st Qu.: 0.100
## Median : 0.200
## Mean : 1.285
## 3rd Qu.: 0.400
## Max. :69.700
## NA's :132
## Ratio.of.nurses.and.midwives.to.physicians
## Min. : 0.100
## 1st Qu.: 2.000
## Median : 3.000
## Mean : 4.846
## 3rd Qu.: 5.300
## Max. :39.400
## NA's :9
## Social.security.expenditure.on.health.as.percentage.of.general.government.expenditure.on.health
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 2.95
## Mean :24.46
## 3rd Qu.:45.98
## Max. :98.40
## NA's :26
## Total.expenditure.on.health.as.percentage.of.gross.domestic.product
## Min. : 1.500
## 1st Qu.: 4.500
## Median : 6.000
## Mean : 6.309
## 3rd Qu.: 7.600
## Max. :16.400
## NA's :9
## Births.attended.by.skilled.health.personnel.....highest.educational.level.of.mother
## Min. :28.00
## 1st Qu.:79.08
## Median :88.25
## Mean :84.50
## 3rd Qu.:94.55
## Max. :99.80
## NA's :142
## Births.attended.by.skilled.health.personnel.....highest.wealth.quintile
## Min. : 26.60
## 1st Qu.: 84.17
## Median : 91.15
## Mean : 86.16
## 3rd Qu.: 98.10
## Max. :100.00
## NA's :146
## Births.attended.by.skilled.health.personnel.....lowest.educational.level.of.mother
## Min. : 2.30
## 1st Qu.:21.55
## Median :33.60
## Mean :40.61
## 3rd Qu.:54.45
## Max. :99.70
## NA's :142
## Births.attended.by.skilled.health.personnel.....lowest.wealth.quintile
## Min. : 0.70
## 1st Qu.:14.65
## Median :28.85
## Mean :37.06
## 3rd Qu.:51.73
## Max. :99.20
## NA's :146
## Births.attended.by.skilled.health.personnel.....rural
## Min. : 2.60
## 1st Qu.:27.48
## Median :39.70
## Mean :47.37
## 3rd Qu.:68.53
## Max. :99.50
## NA's :142
## Births.attended.by.skilled.health.personnel.....urban
## Min. :29.60
## 1st Qu.:73.12
## Median :83.55
## Mean :79.91
## 3rd Qu.:92.45
## Max. :99.60
## NA's :142
## Births.attended.by.skilled.health.personnel.difference.highest.lowest.educational.level.of.mother
## Min. :-1.40
## 1st Qu.:34.02
## Median :49.25
## Mean :43.89
## 3rd Qu.:57.77
## Max. :75.90
## NA's :142
## Births.attended.by.skilled.health.personnel.difference.highest.lowest.wealth.quintile
## Min. :-0.70
## 1st Qu.:37.55
## Median :52.50
## Mean :49.10
## 3rd Qu.:66.25
## Max. :83.10
## NA's :146
## Births.attended.by.skilled.health.personnel.difference.urban.rural
## Min. :-1.10
## 1st Qu.:22.23
## Median :34.95
## Mean :32.53
## 3rd Qu.:43.48
## Max. :62.40
## NA's :142
## Births.attended.by.skilled.health.personnel.ratio.highest.lowest.educational.level.of.mother
## Min. : 1.000
## 1st Qu.: 1.750
## Median : 2.600
## Mean : 3.342
## 3rd Qu.: 3.825
## Max. :25.100
## NA's :142
## Births.attended.by.skilled.health.personnel.ratio.highest.lowest.wealth.quintile
## Min. : 1.000
## 1st Qu.: 1.800
## Median : 3.150
## Mean : 4.912
## 3rd Qu.: 5.775
## Max. :38.000
## NA's :146
## Births.attended.by.skilled.health.personnel.ratio.urban.rural
## Min. : 1.000
## 1st Qu.: 1.300
## Median : 1.850
## Mean : 2.513
## 3rd Qu.: 2.825
## Max. :17.200
## NA's :142
## Measles.immunization.coverage.among.one.year.olds.....highest.educational.level.of.mother
## Min. :53.70
## 1st Qu.:77.25
## Median :85.25
## Mean :83.43
## 3rd Qu.:90.90
## Max. :99.10
## NA's :142
## Measles.immunization.coverage.among.one.year.olds.....highest.wealth.quintile
## Min. :38.10
## 1st Qu.:73.92
## Median :84.55
## Mean :81.62
## 3rd Qu.:90.05
## Max. :97.80
## NA's :146
## Measles.immunization.coverage.among.one.year.olds.....lowest.educational.level.of.mother
## Min. :15.60
## 1st Qu.:44.58
## Median :62.85
## Mean :59.24
## 3rd Qu.:73.20
## Max. :96.00
## NA's :142
## Measles.immunization.coverage.among.one.year.olds.....lowest.wealth.quintile
## Min. : 8.20
## 1st Qu.:46.50
## Median :64.80
## Mean :60.29
## 3rd Qu.:76.62
## Max. :95.10
## NA's :146
## Measles.immunization.coverage.among.one.year.olds.....rural
## Min. :19.20
## 1st Qu.:55.17
## Median :70.95
## Mean :66.83
## 3rd Qu.:80.97
## Max. :96.50
## NA's :142
## Measles.immunization.coverage.among.one.year.olds.....urban
## Min. :37.50
## 1st Qu.:71.40
## Median :80.25
## Mean :77.86
## 3rd Qu.:86.03
## Max. :96.80
## NA's :142
## Measles.immunization.coverage.among.one.year.olds.difference.highest.lowest.educational.level.of.mother
## Min. :-7.60
## 1st Qu.:13.72
## Median :23.80
## Mean :24.19
## 3rd Qu.:33.98
## Max. :50.90
## NA's :142
## Measles.immunization.coverage.among.one.year.olds.difference.highest.lowest.wealth.quintile
## Min. :-11.20
## 1st Qu.: 11.00
## Median : 20.20
## Mean : 21.33
## 3rd Qu.: 33.25
## Max. : 57.20
## NA's :146
## Measles.immunization.coverage.among.one.year.olds.difference.urban.rural
## Min. :-13.400
## 1st Qu.: 4.225
## Median : 9.250
## Mean : 11.030
## 3rd Qu.: 18.075
## Max. : 38.100
## NA's :142
## Measles.immunization.coverage.among.one.year.olds.ratio.highest.lowest.educational.level.of.mother
## Min. :0.90
## 1st Qu.:1.20
## Median :1.40
## Mean :1.55
## 3rd Qu.:1.80
## Max. :4.30
## NA's :142
## Measles.immunization.coverage.among.one.year.olds.ratio.highest.lowest.wealth.quintile
## Min. :0.800
## 1st Qu.:1.100
## Median :1.300
## Mean :1.589
## 3rd Qu.:1.700
## Max. :4.600
## NA's :146
## Measles.immunization.coverage.among.one.year.olds.ratio.urban.rural
## Min. :0.800
## 1st Qu.:1.075
## Median :1.100
## Mean :1.227
## 3rd Qu.:1.300
## Max. :2.100
## NA's :142
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..difference.lowest.highest.educational.level.of.mother
## Min. : 1.40
## 1st Qu.: 38.20
## Median : 63.60
## Mean : 65.17
## 3rd Qu.: 85.00
## Max. :162.20
## NA's :141
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..difference.lowest.highest.wealth.quintile
## Min. :-11.00
## 1st Qu.: 37.55
## Median : 50.35
## Mean : 58.38
## 3rd Qu.: 73.50
## Max. :178.00
## NA's :146
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..difference.rural.urban
## Min. :-14.5
## 1st Qu.: 19.4
## Median : 30.3
## Mean : 33.1
## 3rd Qu.: 42.3
## Max. : 91.4
## NA's :141
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..highest.educational.level.of.mother
## Min. : 19.7
## 1st Qu.: 35.1
## Median : 64.7
## Mean : 62.6
## 3rd Qu.: 85.5
## Max. :143.0
## NA's :141
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..highest.wealth.quintile
## Min. : 15.80
## 1st Qu.: 30.05
## Median : 64.50
## Mean : 66.38
## 3rd Qu.: 92.10
## Max. :187.00
## NA's :146
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..lowest.educational.level.of.mother
## Min. : 31.0
## 1st Qu.: 83.6
## Median :122.6
## Mean :127.8
## 3rd Qu.:164.2
## Max. :269.4
## NA's :141
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..lowest.wealth.quintile
## Min. : 29.0
## 1st Qu.: 77.9
## Median :117.5
## Mean :124.8
## 3rd Qu.:173.0
## Max. :257.0
## NA's :146
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..ratio.lowest.highest.educational.level.of.mother
## Min. :1.000
## 1st Qu.:1.800
## Median :2.100
## Mean :2.156
## 3rd Qu.:2.500
## Max. :3.700
## NA's :141
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..ratio.lowest.highest.wealth.quintile
## Min. :0.900
## 1st Qu.:1.600
## Median :2.000
## Mean :2.211
## 3rd Qu.:2.925
## Max. :5.300
## NA's :146
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..ratio.rural.urban
## Min. :0.900
## 1st Qu.:1.300
## Median :1.400
## Mean :1.454
## 3rd Qu.:1.600
## Max. :2.200
## NA's :141
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..rural
## Min. : 29.6
## 1st Qu.: 69.4
## Median :111.0
## Mean :114.9
## 3rd Qu.:157.4
## Max. :253.2
## NA's :141
## Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..urban
## Min. : 16.20
## 1st Qu.: 44.80
## Median : 80.70
## Mean : 81.84
## 3rd Qu.:115.40
## Max. :184.60
## NA's :141
## Adult.mortality.rate..probability.of.dying.between.15.to.60.years.per.1000.population..both.sexes
## Min. : 48.0
## 1st Qu.:121.0
## Median :186.0
## Mean :222.6
## 3rd Qu.:282.0
## Max. :751.0
## NA's :9
## Adult.mortality.rate..probability.of.dying.between.15.to.60.years.per.1000.population..female
## Min. : 37.0
## 1st Qu.: 84.0
## Median :138.0
## Mean :184.3
## 3rd Qu.:239.0
## Max. :755.0
## NA's :9
## Adult.mortality.rate..probability.of.dying.between.15.to.60.years.per.1000.population..male
## Min. : 59
## 1st Qu.:151
## Median :233
## Mean :260
## 3rd Qu.:329
## Max. :798
## NA's :9
## Age.standardized.mortality.rate.for.cancer..per.100.000.population.
## Min. : 52.0
## 1st Qu.:112.5
## Median :133.0
## Mean :131.1
## 3rd Qu.:149.0
## Max. :306.0
## NA's :11
## Age.standardized.mortality.rate.for.cardiovascular.diseases..per.100.000.population.
## Min. :106.0
## 1st Qu.:257.5
## Median :393.0
## Mean :371.8
## 3rd Qu.:444.0
## Max. :844.0
## NA's :11
## Age.standardized.mortality.rate.for.injuries..per.100.000.population.
## Min. : 12.00
## 1st Qu.: 41.00
## Median : 69.00
## Mean : 81.16
## 3rd Qu.:104.50
## Max. :301.00
## NA's :11
## Age.standardized.mortality.rate.for.non.communicable.diseases..per.100.000.population.
## Min. : 287.0
## 1st Qu.: 565.0
## Median : 728.0
## Mean : 702.0
## 3rd Qu.: 831.5
## Max. :1269.0
## NA's :11
## Deaths.among.children.under.five.years.of.age.due.to.diarrhoeal.diseases....
## Min. : 0.000
## 1st Qu.: 1.100
## Median :10.600
## Mean : 9.309
## 3rd Qu.:15.600
## Max. :37.800
## NA's :15
## Deaths.among.children.under.five.years.of.age.due.to.HIV.AIDS....
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.300
## Mean : 3.323
## 3rd Qu.: 2.600
## Max. :57.100
## NA's :15
## Deaths.among.children.under.five.years.of.age.due.to.injuries....
## Min. : 0.000
## 1st Qu.: 2.350
## Median : 3.800
## Mean : 5.084
## 3rd Qu.: 6.700
## Max. :19.400
## NA's :15
## Deaths.among.children.under.five.years.of.age.due.to.malaria....
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.20
## Mean : 4.07
## 3rd Qu.: 0.90
## Max. :33.00
## NA's :15
## Deaths.among.children.under.five.years.of.age.due.to.measles....
## Min. :0.000
## 1st Qu.:0.000
## Median :0.100
## Mean :1.334
## 3rd Qu.:2.250
## Max. :8.100
## NA's :15
## Deaths.among.children.under.five.years.of.age.due.to.neonatal.causes....
## Min. : 2.80
## 1st Qu.:31.75
## Median :43.10
## Mean :43.20
## 3rd Qu.:52.80
## Max. :99.90
## NA's :15
## Deaths.among.children.under.five.years.of.age.due.to.other.causes....
## Min. : 0.00
## 1st Qu.:12.00
## Median :23.90
## Mean :22.44
## 3rd Qu.:32.15
## Max. :74.90
## NA's :15
## Deaths.among.children.under.five.years.of.age.due.to.pneumonia....
## Min. : 0.00
## 1st Qu.: 3.75
## Median :11.50
## Mean :11.24
## 3rd Qu.:18.45
## Max. :30.30
## NA's :15
## Deaths.due.to.HIV.AIDS..per.100.000.population.per.year.
## Min. : 1.0
## 1st Qu.: 10.0
## Median : 33.0
## Mean : 131.1
## 3rd Qu.: 129.0
## Max. :1550.0
## NA's :67
## Deaths.due.to.tuberculosis.among.HIV.negative.people..per.100.000.population.
## Min. : 0.00
## 1st Qu.: 2.00
## Median : 8.00
## Mean : 20.38
## 3rd Qu.: 32.00
## Max. :115.00
## NA's :9
## Deaths.due.to.tuberculosis.among.HIV.positive.people..per.100.000.population.
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 8.687
## 3rd Qu.: 5.000
## Max. :184.000
## NA's :55
## Healthy.life.expectancy..HALE..at.birth..years..both.sexes
## Min. :29.00
## 1st Qu.:50.00
## Median :60.00
## Mean :57.37
## 3rd Qu.:65.00
## Max. :75.00
## NA's :11
## Healthy.life.expectancy..HALE..at.birth..years..female
## Min. :30.00
## 1st Qu.:51.00
## Median :62.00
## Mean :58.92
## 3rd Qu.:67.50
## Max. :78.00
## NA's :11
## Healthy.life.expectancy..HALE..at.birth..years..male
## Min. :27.00
## 1st Qu.:49.00
## Median :58.00
## Mean :55.84
## 3rd Qu.:63.00
## Max. :72.00
## NA's :11
## Incidence.of.tuberculosis..per.100.000.population.per.year.
## Min. : 2.0
## 1st Qu.: 19.0
## Median : 62.0
## Mean : 142.9
## 3rd Qu.: 204.0
## Max. :1155.0
## NA's :9
## Infant.mortality.rate..per.1.000.live.births..both.sexes
## Min. : 2.00
## 1st Qu.: 9.00
## Median : 23.00
## Mean : 38.04
## 3rd Qu.: 59.00
## Max. :165.00
## NA's :9
## Infant.mortality.rate..per.1.000.live.births..female
## Min. : 2.0
## 1st Qu.: 9.0
## Median : 20.0
## Mean : 34.9
## 3rd Qu.: 54.0
## Max. :154.0
## NA's :9
## Infant.mortality.rate..per.1.000.live.births..male
## Min. : 3.00
## 1st Qu.: 9.00
## Median : 24.00
## Mean : 41.01
## 3rd Qu.: 63.00
## Max. :176.00
## NA's :9
## Life.expectancy.at.birth..years..both.sexes
## Min. :40.00
## 1st Qu.:61.00
## Median :70.00
## Mean :67.27
## 3rd Qu.:75.00
## Max. :83.00
## NA's :9
## Life.expectancy.at.birth..years..female Life.expectancy.at.birth..years..male
## Min. :42.00 Min. :39.00
## 1st Qu.:63.00 1st Qu.:59.00
## Median :73.00 Median :67.00
## Mean :69.72 Mean :64.92
## 3rd Qu.:78.00 3rd Qu.:72.00
## Max. :86.00 Max. :80.00
## NA's :9 NA's :9
## Maternal.mortality.ratio..per.100.000.live.births.
## Min. : 1.0
## 1st Qu.: 15.0
## Median : 130.0
## Mean : 322.4
## 3rd Qu.: 510.0
## Max. :2100.0
## NA's :33
## Neonatal.mortality.rate..per.1.000.live.births.
## Min. : 1.00
## 1st Qu.: 5.00
## Median :14.00
## Mean :19.82
## 3rd Qu.:32.00
## Max. :66.00
## NA's :11
## Number.of.confirmed.poliomyelitis.cases
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 25.74
## 3rd Qu.: 0.75
## Max. :756.00
## NA's :156
## Prevalence.of.HIV.among.adults.aged..gt..15.years..per.100.000.population.
## Min. : 52.0
## 1st Qu.: 115.5
## Median : 480.0
## Mean : 2291.7
## 3rd Qu.: 1819.5
## Max. :34457.0
## NA's :59
## Prevalence.of.tuberculosis..per.100.000.population.
## Min. : 2.0
## 1st Qu.: 24.0
## Median : 80.0
## Mean : 196.5
## 3rd Qu.: 299.0
## Max. :1300.0
## NA's :9
## Under.5.mortality.rate..probability.of.dying.by.age.5.per.1000.live.births..both.sexes
## Min. : 3.00
## 1st Qu.: 10.00
## Median : 26.00
## Mean : 54.79
## 3rd Qu.: 76.00
## Max. :269.00
## NA's :9
## Under.5.mortality.rate..probability.of.dying.by.age.5.per.1000.live.births..female
## Min. : 2.00
## 1st Qu.: 10.00
## Median : 25.00
## Mean : 51.52
## 3rd Qu.: 75.00
## Max. :254.00
## NA's :9
## Under.5.mortality.rate..probability.of.dying.by.age.5.per.1000.live.births..male
## Min. : 3.00
## 1st Qu.: 10.00
## Median : 28.00
## Mean : 57.86
## 3rd Qu.: 80.00
## Max. :286.00
## NA's :9
## Years.of.life.lost.to.communicable.diseases....
## Min. : 3.00
## 1st Qu.:11.00
## Median :31.00
## Mean :39.34
## 3rd Qu.:68.50
## Max. :93.00
## NA's :11
## Years.of.life.lost.to.injuries....
## Min. : 2.00
## 1st Qu.: 9.00
## Median :12.00
## Mean :13.24
## 3rd Qu.:17.00
## Max. :40.00
## NA's :11
## Years.of.life.lost.to.non.communicable.diseases....
## Min. : 4.00
## 1st Qu.:21.50
## Median :52.00
## Mean :47.46
## 3rd Qu.:70.50
## Max. :87.00
## NA's :11
## Children.under.five.years.of.age.overweight.for.age....
## Min. : 0.600
## 1st Qu.: 4.400
## Median : 5.900
## Mean : 7.578
## 3rd Qu.: 9.200
## Max. :30.000
## NA's :91
## Children.under.five.years.of.age.stunted.for.age....
## Min. : 1.20
## 1st Qu.:18.18
## Median :31.25
## Mean :30.91
## 3rd Qu.:43.88
## Max. :63.10
## NA's :90
## Children.under.five.years.of.age.underweight.for.age....
## Min. : 1.100
## 1st Qu.: 5.025
## Median :14.800
## Mean :16.235
## 3rd Qu.:23.650
## Max. :43.500
## NA's :90
## Newborns.with.low.birth.weight....
## Min. : 3.00
## 1st Qu.: 7.00
## Median : 9.00
## Mean :10.76
## 3rd Qu.:13.75
## Max. :32.00
## NA's :28
## Per.capita.recorded.alcohol.consumption..litres.of.pure.alcohol..among.adults...gt..15.years.
## Min. : 0.000
## 1st Qu.: 1.230
## Median : 3.725
## Mean : 4.519
## 3rd Qu.: 6.982
## Max. :15.560
## NA's :22
## Population.using.solid.fuels.....rural Population.using.solid.fuels.....urban
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 34.75 1st Qu.: 3.75
## Median : 82.50 Median :23.50
## Mean : 65.58 Mean :34.01
## 3rd Qu.: 98.00 3rd Qu.:63.75
## Max. :100.00 Max. :99.00
## NA's :130 NA's :130
## Population.with.sustainable.access.to.improved.drinking.water.sources.....rural
## Min. : 10.00
## 1st Qu.: 61.00
## Median : 82.00
## Mean : 77.21
## 3rd Qu.: 97.00
## Max. :100.00
## NA's :23
## Population.with.sustainable.access.to.improved.drinking.water.sources.....total
## Min. : 22.00
## 1st Qu.: 73.00
## Median : 91.00
## Mean : 84.04
## 3rd Qu.: 99.00
## Max. :100.00
## NA's :24
## Population.with.sustainable.access.to.improved.drinking.water.sources.....urban
## Min. : 37.00
## 1st Qu.: 90.00
## Median : 98.00
## Mean : 92.73
## 3rd Qu.:100.00
## Max. :100.00
## NA's :17
## Population.with.sustainable.access.to.improved.sanitation.....rural
## Min. : 3.00
## 1st Qu.: 30.75
## Median : 62.50
## Mean : 60.62
## 3rd Qu.: 95.00
## Max. :100.00
## NA's :30
## Population.with.sustainable.access.to.improved.sanitation.....total
## Min. : 5.00
## 1st Qu.: 41.00
## Median : 78.00
## Mean : 67.67
## 3rd Qu.: 95.00
## Max. :100.00
## NA's :33
## Population.with.sustainable.access.to.improved.sanitation.....urban
## Min. : 14.00
## 1st Qu.: 58.50
## Median : 88.00
## Mean : 77.03
## 3rd Qu.: 98.00
## Max. :100.00
## NA's :27
## Prevalence.of.adults...gt..15.years..who.are.obese.....female
## Min. : 0.70
## 1st Qu.: 6.00
## Median :12.50
## Mean :15.19
## 3rd Qu.:19.00
## Max. :74.90
## NA's :111
## Prevalence.of.adults...gt..15.years..who.are.obese.....male
## Min. : 0.70
## 1st Qu.: 8.35
## Median :13.25
## Mean :15.72
## 3rd Qu.:18.23
## Max. :57.40
## NA's :148
## Prevalence.of.condom.use.by.young.people..15.24.years..at.higher.risk.sex.....female
## Min. : 5.00
## 1st Qu.:21.00
## Median :29.50
## Mean :32.27
## 3rd Qu.:41.75
## Max. :75.00
## NA's :172
## Prevalence.of.condom.use.by.young.people..15.24.years..at.higher.risk.sex.....male
## Min. :12.00
## 1st Qu.:38.00
## Median :47.00
## Mean :47.20
## 3rd Qu.:54.75
## Max. :88.00
## NA's :172
## Prevalence.of.current.tobacco.use.among.adolescents..13.15.years......both.sexes
## Min. : 2.20
## 1st Qu.:11.70
## Median :17.20
## Mean :18.78
## 3rd Qu.:23.80
## Max. :59.70
## NA's :65
## Prevalence.of.current.tobacco.use.among.adolescents..13.15.years......female
## Min. : 1.00
## 1st Qu.: 8.20
## Median :13.20
## Mean :15.38
## 3rd Qu.:20.40
## Max. :54.10
## NA's :65
## Prevalence.of.current.tobacco.use.among.adolescents..13.15.years......male
## Min. : 3.20
## 1st Qu.:14.90
## Median :20.50
## Mean :21.92
## 3rd Qu.:27.80
## Max. :65.80
## NA's :65
## Prevalence.of.current.tobacco.use.among.adults...gt..15.years......both.sexes
## Min. : 4.30
## 1st Qu.:15.10
## Median :25.60
## Mean :24.77
## 3rd Qu.:31.90
## Max. :51.80
## NA's :73
## Prevalence.of.current.tobacco.use.among.adults...gt..15.years......female
## Min. : 0.30
## 1st Qu.: 3.40
## Median : 9.80
## Mean :14.05
## 3rd Qu.:24.50
## Max. :52.40
## NA's :71
## Prevalence.of.current.tobacco.use.among.adults...gt..15.years......male
## Min. : 7.60
## 1st Qu.:25.95
## Median :34.80
## Mean :35.40
## 3rd Qu.:44.15
## Max. :70.10
## NA's :71
## Adolescent_fertility_rate Agricultural_land
## Min. : 1.46 Min. : 0.58
## 1st Qu.: 18.34 1st Qu.:23.95
## Median : 42.08 Median :40.21
## Mean : 55.87 Mean :40.39
## 3rd Qu.: 80.68 3rd Qu.:57.94
## Max. :225.50 Max. :90.58
## NA's :21 NA's :17
## Agriculture_contribution_to_economy Aid_given Aid_received
## Min. : 0.000 Min. : 212.0 Min. : -65.940
## 1st Qu.: 3.692 1st Qu.: 766.2 1st Qu.: 8.275
## Median :10.160 Median : 2118.0 Median : 35.320
## Mean :15.074 Mean : 3614.2 Mean : 86.615
## 3rd Qu.:22.445 3rd Qu.: 3833.5 3rd Qu.: 66.547
## Max. :65.970 Max. :19705.0 Max. :1514.510
## NA's :24 NA's :180 NA's :48
## Aid_received_total All_forms_of_TB_new_cases_per_100_000_estimated
## Min. :-1.650e+08 Min. : 3.00
## 1st Qu.: 4.728e+07 1st Qu.: 21.25
## Median : 1.995e+08 Median : 61.00
## Mean : 5.640e+08 Mean : 143.75
## 3rd Qu.: 5.440e+08 3rd Qu.: 204.00
## Max. : 2.210e+10 Max. :1141.00
## NA's :38 NA's :28
## All_forms_of_TB_new_cases_per_100_000_reported
## Min. : 0.00
## 1st Qu.: 13.25
## Median : 41.00
## Mean : 83.83
## 3rd Qu.:100.75
## Max. :739.00
## NA's :28
## Annual_freshwater_withdrawals_total Arms_exports Arms_imports
## Min. : 0.00 Min. :0.000e+00 Min. :0.000e+00
## 1st Qu.: 1.47 1st Qu.:4.000e+06 1st Qu.:5.000e+06
## Median : 6.58 Median :1.250e+07 Median :1.800e+07
## Mean : 86.86 Mean :2.852e+08 Mean :1.382e+08
## 3rd Qu.: 34.19 3rd Qu.:5.825e+07 3rd Qu.:9.600e+07
## Max. :3794.44 Max. :7.090e+09 Max. :3.790e+09
## NA's :49 NA's :120 NA's :37
## Bad_teeth_per_child Births_attended_by_skilled_health_staff
## Min. :0.180 Min. : 5.70
## 1st Qu.:1.110 1st Qu.: 60.62
## Median :1.700 Median : 91.65
## Mean :2.025 Mean : 79.08
## 3rd Qu.:2.800 3rd Qu.: 99.40
## Max. :6.000 Max. :100.00
## NA's :27 NA's :28
## Breast_cancer_deaths_per_100_000_women
## Min. : 2.00
## 1st Qu.:10.38
## Median :14.20
## Mean :14.29
## 3rd Qu.:18.01
## Max. :29.60
## NA's :30
## Breast_cancer_new_cases_per_100_000_women
## Min. : 3.90
## 1st Qu.: 20.55
## Median : 29.90
## Mean : 37.21
## 3rd Qu.: 49.73
## Max. :101.10
## NA's :30
## Breast_cancer_number_of_female_deaths Breast_cancer_number_of_new_female_cases
## Min. : 7 Min. : 16
## 1st Qu.: 156 1st Qu.: 293
## Median : 549 Median : 1177
## Mean : 2385 Mean : 6689
## 3rd Qu.: 1598 3rd Qu.: 3862
## Max. :44795 Max. :209995
## NA's :31 NA's :31
## Broadband_subscribers Broadband_subscribers_per_100_people CO2_emissions
## Min. : 0 Min. : 0.000 Min. : 0.0100
## 1st Qu.: 192 1st Qu.: 0.065 1st Qu.: 0.6275
## Median : 8970 Median : 0.825 Median : 2.3850
## Mean : 1122746 Mean : 4.453 Mean : 5.1146
## 3rd Qu.: 168902 3rd Qu.: 4.225 3rd Qu.: 7.0925
## Max. :48000000 Max. :29.080 Max. :57.7200
## NA's :14 NA's :50 NA's :16
## CO2_intensity_of_economic_output Capital_formation
## Min. :0.010 Min. :0.000e+00
## 1st Qu.:0.190 1st Qu.:4.240e+09
## Median :0.310 Median :6.170e+10
## Mean :0.415 Mean :7.202e+12
## 3rd Qu.:0.500 3rd Qu.:3.700e+11
## Max. :2.810 Max. :3.900e+14
## NA's :29 NA's :41
## Cell_phones_per_100_people Cell_phones_total Central_bank_discount_rate
## Min. : 0.30 Min. : 0 Min. : 0.00
## 1st Qu.: 9.75 1st Qu.: 177250 1st Qu.: 4.00
## Median : 37.50 Median : 905542 Median : 7.00
## Mean : 44.61 Mean : 9165871 Mean : 13.72
## 3rd Qu.: 77.77 3rd Qu.: 4787298 3rd Qu.: 12.00
## Max. :154.80 Max. :335000000 Max. :540.00
## NA's :15 NA's :14 NA's :61
## Cervical_cancer_deaths_per_100_000_women
## Min. : 0.640
## 1st Qu.: 3.507
## Median : 8.050
## Mean :12.914
## 3rd Qu.:23.000
## Max. :55.600
## NA's :30
## Cervical_cancer_new_cases_per_100_000_women
## Min. : 2.00
## 1st Qu.:10.88
## Median :20.25
## Mean :23.18
## 3rd Qu.:30.43
## Max. :87.30
## NA's :30
## Cervical_cancer_number_of_female_deaths
## Min. : 3
## 1st Qu.: 111
## Median : 354
## Mean : 1590
## 3rd Qu.: 1287
## Max. :74118
## NA's :31
## Cervical_cancer_number_of_new_female_cases Children_and_elderly
## Min. : 5.0 Min. : 26.50
## 1st Qu.: 220.5 1st Qu.: 48.28
## Median : 654.0 Median : 57.20
## Mean : 2865.7 Mean : 62.84
## 3rd Qu.: 1956.5 3rd Qu.: 78.20
## Max. :132082.0 Max. :107.67
## NA's :31 NA's :34
## Children_out_of_school_primary Children_out_of_school_primary_female
## Min. : 37 Min. : 0
## 1st Qu.: 8656 1st Qu.: 2688
## Median : 34216 Median : 15064
## Mean : 360415 Mean : 202990
## 3rd Qu.: 223811 3rd Qu.: 110107
## Max. :8096824 Max. :4712631
## NA's :38 NA's :40
## Children_out_of_school_primary_male Children_per_woman Coal_consumption
## Min. : 0 Min. :0.910 Min. : 0.05
## 1st Qu.: 2732 1st Qu.:1.815 1st Qu.: 1.48
## Median : 15645 Median :2.400 Median : 6.06
## Mean : 167267 Mean :2.966 Mean : 47.88
## 3rd Qu.: 107337 3rd Qu.:3.842 3rd Qu.: 26.68
## Max. :3549652 Max. :7.190 Max. :1088.80
## NA's :38 NA's :34 NA's :143
## Coal_consumption_per_person Coal_production Coal_production_per_person
## Min. :0.0000 Min. : 0.240 Min. : 0.00
## 1st Qu.:0.0550 1st Qu.: 4.138 1st Qu.: 0.15
## Median :0.3400 Median : 12.640 Median : 0.30
## Mean :0.5215 Mean : 88.892 Mean : 0.97
## 3rd Qu.:0.7800 3rd Qu.: 57.102 3rd Qu.: 0.90
## Max. :2.7300 Max. :1119.830 Max. :10.28
## NA's :144 NA's :170 NA's :171
## Colon_and_Rectum_cancer_deaths_per_100_000_men
## Min. : 0.600
## 1st Qu.: 4.000
## Median : 6.270
## Mean : 8.958
## 3rd Qu.:13.550
## Max. :34.070
## NA's :30
## Colon_and_Rectum_cancer_deaths_per_100_000_women
## Min. : 0.600
## 1st Qu.: 3.075
## Median : 5.250
## Mean : 6.198
## 3rd Qu.: 9.207
## Max. :17.040
## NA's :30
## Colon_and_Rectum_cancer_new_cases_per_100_000_men
## Min. : 1.00
## 1st Qu.: 5.10
## Median :10.25
## Mean :16.20
## 3rd Qu.:25.65
## Max. :58.50
## NA's :30
## Colon_and_Rectum_cancer_new_cases_per_100_000_women
## Min. : 0.90
## 1st Qu.: 3.80
## Median : 8.10
## Mean :12.03
## 3rd Qu.:18.12
## Max. :42.20
## NA's :30
## Colon_and_Rectum_cancer_number_of_female_deaths
## Min. : 0.0
## 1st Qu.: 51.5
## Median : 193.0
## Mean : 1453.9
## 3rd Qu.: 815.0
## Max. :35902.0
## NA's :31
## Colon_and_Rectum_cancer_number_of_male_deaths
## Min. : 2.0
## 1st Qu.: 58.5
## Median : 216.0
## Mean : 1614.7
## 3rd Qu.: 980.5
## Max. :50200.0
## NA's :31
## Colon_and_Rectum_cancer_number_of_new_female_cases
## Min. : 1
## 1st Qu.: 68
## Median : 282
## Mean : 2746
## 3rd Qu.: 1422
## Max. :80427
## NA's :31
## Colon_and_Rectum_cancer_number_of_new_male_cases Consumer_price_index
## Min. : 2.0 Min. : 80.0
## 1st Qu.: 69.5 1st Qu.: 112.0
## Median : 339.0 Median : 119.0
## Mean : 3195.5 Mean : 150.4
## 3rd Qu.: 1524.0 3rd Qu.: 142.0
## Max. :88142.0 Max. :1873.0
## NA's :31 NA's :45
## Contraceptive_use Deaths_from_TB_per_100_000_estimated Debt_servicing_costs
## Min. : 2.80 Min. : 0.00 Min. : 1.000
## 1st Qu.:30.40 1st Qu.: 3.00 1st Qu.: 4.000
## Median :49.60 Median : 8.50 Median : 7.000
## Mean :47.77 Mean : 27.53 Mean : 8.923
## 3rd Qu.:66.97 3rd Qu.: 37.00 3rd Qu.:11.000
## Max. :96.00 Max. :271.00 Max. :71.000
## NA's :40 NA's :28 NA's :72
## Democracy_score Electric_power_consumption Electricity_generation
## Min. :-10.000 Min. : 34.39 Min. : 8.68
## 1st Qu.: -3.000 1st Qu.: 717.84 1st Qu.: 44.05
## Median : 6.000 Median : 2116.55 Median : 91.16
## Mean : 3.355 Mean : 4021.08 Mean : 276.55
## 3rd Qu.: 9.000 3rd Qu.: 5726.85 3rd Qu.: 230.22
## Max. : 10.000 Max. :27986.52 Max. :4257.37
## NA's :47 NA's :71 NA's :139
## Electricity_generation_per_person Energy_use
## Min. : 156.9 Min. : 157.8
## 1st Qu.: 2540.4 1st Qu.: 632.2
## Median : 5548.2 Median : 1444.7
## Mean : 6990.5 Mean : 2702.9
## 3rd Qu.: 8514.2 3rd Qu.: 3635.9
## Max. :30061.3 Max. :19877.3
## NA's :140 NA's :71
## Expenditure_per_student_primary Expenditure_per_student_secondary
## Min. : 0.91 Min. : 2.44
## 1st Qu.: 9.37 1st Qu.: 13.01
## Median :14.07 Median : 19.74
## Mean :14.59 Mean : 21.86
## 3rd Qu.:19.19 3rd Qu.: 26.37
## Max. :37.26 Max. :100.87
## NA's :55 NA's :65
## Expenditure_per_student_tertiary Exports_of_goods_and_services
## Min. : 0.00 Min. : 0.42
## 1st Qu.: 23.50 1st Qu.: 26.04
## Median : 36.60 Median : 37.98
## Mean : 97.75 Mean : 44.12
## 3rd Qu.: 90.07 3rd Qu.: 54.87
## Max. :1145.67 Max. :244.30
## NA's :77 NA's :20
## Exports_unit_value External_debt_total_DOD_current_USdollars
## Min. : 50.81 Min. :8.180e+07
## 1st Qu.: 96.06 1st Qu.:1.575e+09
## Median :115.92 Median :4.640e+09
## Mean :118.28 Mean :2.079e+10
## 3rd Qu.:139.06 3rd Qu.:1.695e+10
## Max. :205.80 Max. :2.820e+11
## NA's :135 NA's :71
## External_debt_total_pct_of_GNI Female_labour_force
## Min. : 4.91 Min. :13.10
## 1st Qu.: 33.54 1st Qu.:36.96
## Median : 51.58 Median :41.99
## Mean : 65.10 Mean :40.24
## 3rd Qu.: 76.17 3rd Qu.:46.19
## Max. :619.18 Max. :53.45
## NA's :73 NA's :36
## Fixed_line_and_mobile_phone_subscribers Foreign_direct_investment_net_inflows
## Min. : 0.28 Min. :-15.130
## 1st Qu.: 12.57 1st Qu.: 1.165
## Median : 52.05 Median : 3.000
## Mean : 62.80 Mean : 6.247
## 3rd Qu.:111.94 3rd Qu.: 6.245
## Max. :211.17 Max. :312.670
## NA's :14 NA's :31
## Foreign_direct_investment_net_outflows Forest_area
## Min. : -5.710 Min. : 10
## 1st Qu.: 0.000 1st Qu.: 4090
## Median : 0.100 Median : 27540
## Mean : 3.395 Mean : 215241
## 3rd Qu.: 1.030 3rd Qu.: 110825
## Max. :332.380 Max. :8087900
## NA's :41 NA's :19
## Gross_capital_formation HIV_infected Health_expenditure_per_person
## Min. : 8.07 Min. : 0.010 Min. : 0.23
## 1st Qu.:18.38 1st Qu.: 0.110 1st Qu.: 33.25
## Median :22.41 Median : 0.500 Median : 157.00
## Mean :23.77 Mean : 2.085 Mean : 693.04
## 3rd Qu.:27.71 3rd Qu.: 1.530 3rd Qu.: 486.75
## Max. :63.23 Max. :26.350 Max. :6657.00
## NA's :22 NA's :55 NA's :22
## Health_expenditure_private Health_expenditure_public_pct_of_GDP
## Min. :0.330 Min. : 0.260
## 1st Qu.:1.490 1st Qu.: 1.962
## Median :2.275 Median : 3.190
## Mean :2.463 Mean : 3.627
## 3rd Qu.:3.192 3rd Qu.: 4.680
## Max. :8.330 Max. :11.730
## NA's :22 NA's :22
## Health_expenditure_public_pct_of_government_expenditure
## Min. : 0.70
## 1st Qu.: 7.00
## Median :10.25
## Mean :10.56
## 3rd Qu.:13.53
## Max. :36.30
## NA's :22
## Health_expenditure_public_pct_of_total_health_expenditure
## Min. :11.60
## 1st Qu.:44.05
## Median :60.55
## Mean :57.67
## 3rd Qu.:73.03
## Max. :92.40
## NA's :22
## Health_expenditure_total High_technology_exports Hydroelectricity_consumption
## Min. : 1.700 Min. : 0.000 Min. : 0.13
## 1st Qu.: 4.275 1st Qu.: 0.835 1st Qu.: 1.22
## Median : 5.600 Median : 4.755 Median : 2.94
## Mean : 6.076 Mean : 9.480 Mean :11.04
## 3rd Qu.: 7.600 3rd Qu.:12.730 3rd Qu.: 8.62
## Max. :15.900 Max. :70.730 Max. :89.84
## NA's :22 NA's :36 NA's :147
## Hydroelectricity_consumption_per_person Imports_of_goods_and_services
## Min. :0.0000 Min. : 1.05
## 1st Qu.:0.0425 1st Qu.: 31.10
## Median :0.1200 Median : 43.18
## Mean :0.4782 Mean : 49.30
## 3rd Qu.:0.2100 3rd Qu.: 62.16
## Max. :6.7300 Max. :214.82
## NA's :148 NA's :20
## Imports_unit_value Improved_sanitation_facilities_urban Improved_water_source
## Min. : 47.0 Min. : 24.00 Min. : 22.00
## 1st Qu.:108.0 1st Qu.: 59.00 1st Qu.: 67.00
## Median :121.0 Median : 79.50 Median : 85.00
## Mean :123.5 Mean : 76.63 Mean : 80.36
## 3rd Qu.:137.5 3rd Qu.: 97.00 3rd Qu.: 97.00
## Max. :330.0 Max. :100.00 Max. :100.00
## NA's :151 NA's :40 NA's :32
## Income_growth Income_per_person Income_share_held_by_lowest_20pct
## Min. :-7.500 Min. : 264 Min. : 1.400
## 1st Qu.: 1.137 1st Qu.: 1985 1st Qu.: 4.805
## Median : 3.235 Median : 6461 Median : 6.255
## Mean : 3.449 Mean :11865 Mean : 6.139
## 3rd Qu.: 4.680 3rd Qu.:16268 3rd Qu.: 7.465
## Max. :24.970 Max. :70014 Max. :10.580
## NA's :32 NA's :11 NA's :72
## Industry_contribution_to_economy Inequality_index Infant_mortality_rate
## Min. : 7.09 Min. :24.70 Min. : 2.00
## 1st Qu.:22.79 1st Qu.:34.00 1st Qu.: 8.50
## Median :28.29 Median :39.45 Median : 24.00
## Mean :30.81 Mean :40.74 Mean : 40.47
## 3rd Qu.:35.09 3rd Qu.:47.01 3rd Qu.: 64.00
## Max. :94.21 Max. :74.33 Max. :165.00
## NA's :26 NA's :72 NA's :35
## Infectious_TB_new_cases_per_100_000_estimated
## Min. : 2.0
## 1st Qu.: 10.0
## Median : 28.0
## Mean : 63.0
## 3rd Qu.: 93.5
## Max. :452.0
## NA's :27
## Infectious_TB_new_cases_per_100_000_reported
## Min. : 0.00
## 1st Qu.: 7.00
## Median : 21.00
## Mean : 35.82
## 3rd Qu.: 47.50
## Max. :262.00
## NA's :27
## Infectious_TB_treatment_completeness Inflation_GDP_deflator Internet_users
## Min. : 28 Min. : -8.390 Min. : 0.00
## 1st Qu.: 72 1st Qu.: 2.592 1st Qu.: 2.90
## Median : 80 Median : 5.495 Median : 8.40
## Mean : 78 Mean : 10.490 Mean :17.78
## 3rd Qu.: 86 3rd Qu.: 11.915 3rd Qu.:27.10
## Max. :100 Max. :237.950 Max. :76.20
## NA's :44 NA's :16 NA's :15
## Life_expectancy_at_birth Literacy_rate_adult_female Literacy_rate_adult_male
## Min. :40.68 Min. :12.59 Min. :31.44
## 1st Qu.:59.47 1st Qu.:59.66 1st Qu.:74.00
## Median :71.33 Median :80.79 Median :88.32
## Mean :67.47 Mean :73.54 Mean :82.68
## 3rd Qu.:76.02 3rd Qu.:93.69 3rd Qu.:95.93
## Max. :82.27 Max. :99.79 Max. :99.81
## NA's :34 NA's :66 NA's :66
## Literacy_rate_adult_total Literacy_rate_youth_female Literacy_rate_youth_male
## Min. :23.55 Min. :16.86 Min. :32.25
## 1st Qu.:67.45 1st Qu.:71.08 1st Qu.:80.89
## Median :84.68 Median :95.45 Median :95.72
## Mean :77.98 Mean :83.31 Mean :87.92
## 3rd Qu.:93.83 3rd Qu.:98.84 3rd Qu.:98.88
## Max. :99.80 Max. :99.95 Max. :99.96
## NA's :65 NA's :70 NA's :70
## Literacy_rate_youth_total Liver_cancer_deaths_per_100_000_men
## Min. :24.19 Min. : 0.710
## 1st Qu.:75.81 1st Qu.: 4.015
## Median :95.59 Median : 6.030
## Mean :85.53 Mean :10.748
## 3rd Qu.:98.87 3rd Qu.:14.425
## Max. :99.96 Max. :93.300
## NA's :70 NA's :30
## Liver_cancer_deaths_per_100_000_women Liver_cancer_new_cases_per_100_000_men
## Min. : 0.200 Min. : 0.80
## 1st Qu.: 1.900 1st Qu.: 3.70
## Median : 3.170 Median : 6.10
## Mean : 4.738 Mean :11.39
## 3rd Qu.: 5.500 3rd Qu.:15.30
## Max. :47.300 Max. :98.90
## NA's :30 NA's :30
## Liver_cancer_new_cases_per_100_000_women Liver_cancer_number_of_female_deaths
## Min. : 0.200 Min. : 0.5
## 1st Qu.: 1.900 1st Qu.: 41.5
## Median : 3.000 Median : 147.0
## Mean : 4.916 Mean : 1050.1
## 3rd Qu.: 5.600 3rd Qu.: 408.5
## Max. :57.300 Max. :89055.0
## NA's :30 NA's :31
## Liver_cancer_number_of_male_deaths Liver_cancer_number_of_new_female_cases
## Min. : 2 Min. : 0
## 1st Qu.: 70 1st Qu.: 41
## Median : 253 Median : 125
## Mean : 2410 Mean : 1065
## 3rd Qu.: 819 3rd Qu.: 423
## Max. :232796 Max. :94937
## NA's :31 NA's :31
## Liver_cancer_number_of_new_male_cases Lung_cancer_deaths_per_100_000_men
## Min. : 2.0 Min. : 0.400
## 1st Qu.: 67.5 1st Qu.: 6.435
## Median : 244.0 Median :16.020
## Mean : 2554.0 Mean :21.095
## 3rd Qu.: 799.0 3rd Qu.:31.457
## Max. :250907.0 Max. :78.660
## NA's :31 NA's :30
## Lung_cancer_deaths_per_100_000_women Lung_cancer_new_cases_per_100_000_men
## Min. : 0.100 Min. : 0.500
## 1st Qu.: 2.100 1st Qu.: 7.425
## Median : 4.515 Median :18.750
## Mean : 5.719 Mean :25.281
## 3rd Qu.: 7.405 3rd Qu.:39.850
## Max. :27.800 Max. :94.600
## NA's :30 NA's :30
## Lung_cancer_new_cases_per_100_000_women Lung_cancer_number_of_female_deaths
## Min. : 0.100 Min. : 0
## 1st Qu.: 2.175 1st Qu.: 21
## Median : 5.300 Median : 126
## Mean : 6.863 Mean : 1917
## 3rd Qu.: 9.400 3rd Qu.: 742
## Max. :36.100 Max. :109059
## NA's :30 NA's :31
## Lung_cancer_number_of_male_deaths Lung_cancer_number_of_new_female_cases
## Min. : 1 Min. : 0
## 1st Qu.: 79 1st Qu.: 22
## Median : 441 Median : 134
## Mean : 4905 Mean : 2242
## 3rd Qu.: 2108 3rd Qu.: 794
## Max. :231301 Max. :126718
## NA's :31 NA's :31
## Lung_cancer_number_of_new_male_cases
## Min. : 1.0
## 1st Qu.: 82.5
## Median : 483.0
## Mean : 5579.2
## 3rd Qu.: 2414.5
## Max. :269650.0
## NA's :31
## Malaria_prevention_insecticide_treated_bed_nets_usage Malaria_treatment
## Min. : 0.100 Min. : 0.70
## 1st Qu.: 1.200 1st Qu.:13.05
## Median : 3.050 Median :35.90
## Mean : 6.485 Mean :35.61
## 3rd Qu.: 7.400 3rd Qu.:57.23
## Max. :53.600 Max. :68.90
## NA's :156 NA's :156
## Malnutrition_weight_for_age Market_value_of_listed_companies
## Min. : 1.10 Min. : 0.09
## 1st Qu.: 6.10 1st Qu.: 15.92
## Median :15.30 Median : 35.48
## Mean :16.79 Mean : 63.37
## 3rd Qu.:24.40 3rd Qu.: 87.94
## Max. :47.60 Max. :593.25
## NA's :101 NA's :84
## Maternal_mortality Math_achievement_4th_grade Math_achievement_8th_grade
## Min. : 0.00 Min. :331.0 Min. :298.0
## 1st Qu.: 25.13 1st Qu.:494.1 1st Qu.:443.9
## Median : 100.00 Median :520.8 Median :483.3
## Mean : 305.62 Mean :509.2 Mean :477.1
## 3rd Qu.: 480.00 3rd Qu.:535.9 3rd Qu.:506.7
## Max. :2000.00 Max. :597.3 Max. :597.0
## NA's :25 NA's :182 NA's :172
## Measles_immunization Medical_Doctors Merchandise_trade Military_expenditure
## Min. :23.00 Min. :0.0200 Min. : 12.97 Min. : 0.000
## 1st Qu.:79.00 1st Qu.:0.2075 1st Qu.: 47.07 1st Qu.: 1.060
## Median :90.00 Median :1.1000 Median : 63.97 Median : 1.620
## Mean :84.94 Mean :1.4057 Mean : 73.86 Mean : 2.271
## 3rd Qu.:97.00 3rd Qu.:2.3100 3rd Qu.: 88.86 3rd Qu.: 2.630
## Max. :99.00 Max. :5.9100 Max. :368.19 Max. :24.110
## NA's :21 NA's :30 NA's :21 NA's :47
## Natural_gas_consumption Natural_gas_consumption_per_person
## Min. : 0.260 Min. : 19.46
## 1st Qu.: 6.162 1st Qu.: 423.87
## Median : 18.150 Median : 1021.80
## Mean : 47.488 Mean : 1942.36
## 3rd Qu.: 41.945 3rd Qu.: 1586.02
## Max. :623.280 Max. :21667.32
## NA's :154 NA's :155
## Natural_gas_production Natural_gas_production_per_person
## Min. : 3.55 Min. : 27.42
## 1st Qu.: 11.93 1st Qu.: 280.22
## Median : 25.90 Median : 1248.79
## Mean : 58.89 Mean : 4490.66
## 3rd Qu.: 57.82 3rd Qu.: 2708.02
## Max. :597.96 Max. :53067.55
## NA's :156 NA's :156
## Natural_gas_proved_reserves Natural_gas_proven_reserves_per_person
## Min. : 0.090 Min. : 1.02
## 1st Qu.: 0.410 1st Qu.: 7.96
## Median : 1.100 Median : 40.03
## Mean : 3.503 Mean : 775.75
## 3rd Qu.: 2.480 3rd Qu.: 159.17
## Max. :44.610 Max. :29703.92
## NA's :153 NA's :153
## Net_barter_terms_of_trade Nuclear_consumption Nuclear_consumption_per_person
## Min. : 77.22 Min. : 0.070 Min. :0.0000
## 1st Qu.: 92.74 1st Qu.: 2.575 1st Qu.:0.0600
## Median :101.93 Median : 5.530 Median :0.4000
## Mean :110.11 Mean : 20.836 Mean :0.4628
## 3rd Qu.:122.72 3rd Qu.: 19.685 3rd Qu.:0.6500
## Max. :181.79 Max. :186.260 Max. :1.8200
## NA's :66 NA's :172 NA's :173
## Number_of_deaths_from_TB_estimated Number_of_existing_TB_cases_estimated
## Min. : 0 Min. : 4
## 1st Qu.: 55 1st Qu.: 562
## Median : 733 Median : 6025
## Mean : 9275 Mean : 80633
## 3rd Qu.: 4440 3rd Qu.: 38912
## Max. :330695 Max. :3511772
## NA's :27 NA's :28
## Oil_consumption Oil_consumption_per_person Oil_production
## Min. : 20.82 Min. : 0.240 Min. : 72.68
## 1st Qu.: 207.11 1st Qu.: 4.253 1st Qu.: 295.00
## Median : 330.79 Median : 7.825 Median : 734.44
## Mean : 1200.18 Mean :12.133 Mean : 1670.16
## 3rd Qu.: 1231.80 3rd Qu.:15.545 3rd Qu.: 2155.64
## Max. :20802.18 Max. :65.510 Max. :11114.43
## NA's :137 NA's :138 NA's :154
## Oil_production_per_person Oil_proved_reserves Oil_proven_reserves_per_person
## Min. :0.00000 Min. : 0.45 Min. : 5.48
## 1st Qu.:0.00000 1st Qu.: 1.86 1st Qu.: 55.56
## Median :0.01000 Median : 5.25 Median : 219.26
## Mean :0.05794 Mean : 25.84 Mean : 3411.56
## 3rd Qu.:0.03250 3rd Qu.: 21.97 3rd Qu.: 1938.87
## Max. :0.44000 Max. :264.21 Max. :43456.89
## NA's :154 NA's :155 NA's :155
## Old_version_of_Income_per_person Patent_applications Patents_granted
## Min. : 225.7 Min. : 1 Min. : 0.38
## 1st Qu.: 1512.1 1st Qu.: 1036 1st Qu.: 35.25
## Median : 4472.3 Median : 89636 Median : 240.50
## Mean : 8048.3 Mean : 93929 Mean : 6791.59
## 3rd Qu.:11950.1 3rd Qu.:157931 3rd Qu.: 2243.25
## Max. :43696.8 Max. :486906 Max. :167334.00
## NA's :28 NA's :59 NA's :68
## Patents_in_force People_living_with_HIV Personal_computers_per_100_people
## Min. : 4 Min. : 17 Min. : 0.10
## 1st Qu.: 349 1st Qu.: 5735 1st Qu.: 1.40
## Median : 1324 Median : 18492 Median : 5.50
## Mean : 48481 Mean : 219519 Mean :14.89
## 3rd Qu.: 17852 3rd Qu.: 127032 3rd Qu.:17.52
## Max. :1474028 Max. :5560376 Max. :82.30
## NA's :90 NA's :55 NA's :36
## Personal_computers_total Population_growth
## Min. : 600 Min. :-1.240
## 1st Qu.: 45000 1st Qu.: 0.640
## Median : 281500 Median : 1.350
## Mean : 4573835 Mean : 1.423
## 3rd Qu.: 2143232 3rd Qu.: 2.200
## Max. :224000000 Max. : 4.090
## NA's :36 NA's :15
## Population_in_urban_agglomerations_more_than_1_million Population_total
## Min. : 3.85 Min. :6.536e+04
## 1st Qu.: 12.86 1st Qu.:2.247e+06
## Median : 20.27 Median :7.309e+06
## Mean : 24.45 Mean :3.468e+07
## 3rd Qu.: 31.13 3rd Qu.:2.275e+07
## Max. :103.34 Max. :1.300e+09
## NA's :98 NA's :28
## Poverty_headcount_ratio_at_national_poverty_line Present_value_of_debt
## Min. : 4.60 Mode:logical
## 1st Qu.:21.25 NA's:202
## Median :35.00
## Mean :37.47
## 3rd Qu.:51.45
## Max. :74.90
## NA's :115
## Primary_completion_rate_total Primary_energy_consumption
## Min. : 24.29 Min. : 2.70
## 1st Qu.: 72.03 1st Qu.: 25.22
## Median : 92.88 Median : 54.11
## Mean : 84.31 Mean : 155.05
## 3rd Qu.: 99.37 3rd Qu.: 140.03
## Max. :138.16 Max. :2342.71
## NA's :30 NA's :137
## Primary_energy_consumption_per_person Primary_school_completion_pct_of_boys
## Min. : 0.130 Min. : 24.75
## 1st Qu.: 1.735 1st Qu.: 70.75
## Median : 3.175 Median : 91.00
## Mean : 4.057 Mean : 82.77
## 3rd Qu.: 4.575 3rd Qu.: 98.00
## Max. :22.720 Max. :114.00
## NA's :138 NA's :55
## Primary_school_completion_pct_of_girls Prostate_cancer_deaths_per_100_000_men
## Min. : 18.00 Min. : 0.20
## 1st Qu.: 64.60 1st Qu.: 5.35
## Median : 91.00 Median :11.46
## Mean : 80.27 Mean :12.06
## 3rd Qu.: 98.00 3rd Qu.:16.48
## Max. :114.00 Max. :55.30
## NA's :55 NA's :30
## Prostate_cancer_new_cases_per_100_000_men
## Min. : 0.300
## 1st Qu.: 8.075
## Median : 19.150
## Mean : 26.043
## 3rd Qu.: 36.500
## Max. :124.800
## NA's :30
## Prostate_cancer_number_of_male_deaths Prostate_cancer_number_of_new_male_cases
## Min. : 1 Min. : 1
## 1st Qu.: 83 1st Qu.: 130
## Median : 274 Median : 416
## Mean : 1281 Mean : 3946
## 3rd Qu.: 1074 3rd Qu.: 1970
## Max. :32442 Max. :239930
## NA's :31 NA's :31
## Pump_price_for_gasoline
## Min. :0.0000
## 1st Qu.:0.6600
## Median :0.8800
## Mean :0.9074
## 3rd Qu.:1.2000
## Max. :1.9000
## NA's :33
## Ratio_of_girls_to_boys_in_primary_and_secondary_education
## Min. : 48.12
## 1st Qu.: 93.33
## Median : 99.08
## Mean : 94.80
## 3rd Qu.:101.47
## Max. :112.56
## NA's :21
## Ratio_of_young_literate_females_to_males Roads_paved
## Min. : 36.20 Min. : 0.80
## 1st Qu.: 89.73 1st Qu.: 18.23
## Median : 99.70 Median : 44.45
## Mean : 91.64 Mean : 49.31
## 3rd Qu.:100.11 3rd Qu.: 81.39
## Max. :106.36 Max. :100.00
## NA's :70 NA's :22
## SO2_emissions_per_person Services_contribution_to_economy
## Min. : 0.160 Min. : 2.94
## 1st Qu.: 1.798 1st Qu.:43.97
## Median : 4.950 Median :55.65
## Mean : 11.638 Mean :54.14
## 3rd Qu.: 15.178 3rd Qu.:65.71
## Max. :106.610 Max. :90.63
## NA's :62 NA's :26
## Stomach_cancer_deaths_per_100_000_men Stomach_cancer_deaths_per_100_000_women
## Min. : 0.500 Min. : 0.500
## 1st Qu.: 5.082 1st Qu.: 3.000
## Median : 8.100 Median : 5.070
## Mean :10.653 Mean : 5.799
## 3rd Qu.:15.262 3rd Qu.: 7.612
## Max. :37.100 Max. :24.100
## NA's :30 NA's :30
## Stomach_cancer_new_cases_per_100_000_men
## Min. : 0.600
## 1st Qu.: 6.525
## Median :12.350
## Mean :14.852
## 3rd Qu.:19.800
## Max. :69.700
## NA's :30
## Stomach_cancer_new_cases_per_100_000_women
## Min. : 0.600
## 1st Qu.: 3.675
## Median : 6.400
## Mean : 7.997
## 3rd Qu.: 9.925
## Max. :30.600
## NA's :30
## Stomach_cancer_number_of_female_deaths Stomach_cancer_number_of_male_deaths
## Min. : 3.0 Min. : 2.0
## 1st Qu.: 47.5 1st Qu.: 57.0
## Median : 214.0 Median : 302.0
## Mean : 1477.3 Mean : 2589.9
## 3rd Qu.: 626.5 3rd Qu.: 945.5
## Max. :101719.0 Max. :206632.0
## NA's :31 NA's :31
## Stomach_cancer_number_of_new_female_cases
## Min. : 3
## 1st Qu.: 58
## Median : 252
## Mean : 1921
## 3rd Qu.: 736
## Max. :128478
## NA's :31
## Stomach_cancer_number_of_new_male_cases Sugar_per_person Surface_area
## Min. : 2 Min. : 5.48 Min. : 28
## 1st Qu.: 65 1st Qu.: 41.10 1st Qu.: 27398
## Median : 372 Median : 87.67 Median : 130980
## Mean : 3505 Mean : 82.37 Mean : 709454
## 3rd Qu.: 1132 3rd Qu.:115.07 3rd Qu.: 533852
## Max. :264460 Max. :191.78 Max. :17100000
## NA's :31 NA's :29 NA's :14
## Tax_revenue Total_CO2_emissions Total_income Total_reserves
## Min. : 0.98 Min. : 26 Min. :5.190e+07 Min. : 0.99
## 1st Qu.:12.11 1st Qu.: 1673 1st Qu.:3.318e+09 1st Qu.: 16.29
## Median :16.72 Median : 10212 Median :1.145e+10 Median : 28.52
## Mean :17.24 Mean : 148360 Mean :2.016e+11 Mean : 57.25
## 3rd Qu.:21.65 3rd Qu.: 65492 3rd Qu.:8.680e+10 3rd Qu.: 55.31
## Max. :44.34 Max. :5776432 Max. :1.100e+13 Max. :1334.86
## NA's :65 NA's :16 NA's :24 NA's :74
## Trade_balance_goods_and_services Under_five_mortality_from_CME
## Min. :-7.140e+11 Min. : 2.90
## 1st Qu.:-1.210e+09 1st Qu.: 12.40
## Median :-2.240e+08 Median : 29.98
## Mean : 3.424e+08 Mean : 56.68
## 3rd Qu.: 1.024e+09 3rd Qu.: 88.70
## Max. : 1.390e+11 Max. :267.00
## NA's :31 NA's :21
## Under_five_mortality_from_IHME Under_five_mortality_rate Urban_population
## Min. : 3.000 Min. : 2.90 Min. : 15456
## 1st Qu.: 8.475 1st Qu.: 12.40 1st Qu.: 917162
## Median : 27.600 Median : 29.98 Median : 3427661
## Mean : 54.356 Mean : 56.68 Mean : 16657627
## 3rd Qu.: 82.900 3rd Qu.: 88.70 3rd Qu.: 9837113
## Max. :253.700 Max. :267.00 Max. :527000000
## NA's :32 NA's :21 NA's :14
## Urban_population_growth Urban_population_pct_of_total
## Min. :-1.160 Min. : 10.00
## 1st Qu.: 1.105 1st Qu.: 35.65
## Median : 1.945 Median : 57.30
## Mean : 2.166 Mean : 55.20
## 3rd Qu.: 3.252 3rd Qu.: 72.75
## Max. : 7.850 Max. :100.00
## NA's :14 NA's :14
summary(economic_freedom)
## CountryID Country.Name WEBNAME Region
## Min. : 1.00 Length:188 Length:188 Length:188
## 1st Qu.: 46.75 Class :character Class :character Class :character
## Median : 93.50 Mode :character Mode :character Mode :character
## Mean : 93.59
## 3rd Qu.:140.25
## Max. :186.00
## NA's :4
## World.Rank Region.Rank X2022.Score Property.Rights
## Length:188 Length:188 Length:188 Length:188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Judical.Effectiveness Government.Integrity Tax.Burden
## Length:188 Length:188 Length:188
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Gov.t.Spending Fiscal.Health Business.Freedom Labor.Freedom
## Length:188 Length:188 Length:188 Length:188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Monetary.Freedom Trade.Freedom Investment.Freedom. Financial.Freedom
## Length:188 Length:188 Length:188 Length:188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Tariff.Rate.... X Income.Tax.Rate....
## Length:188 Length:188 Length:188
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Corporate.Tax.Rate.... Tax.Burden...of.GDP X.1
## Length:188 Length:188 Length:188
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Gov.t.Expenditure...of.GDP. Country X.2
## Length:188 Length:188 Length:188
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Population..Millions. GDP..Billions..PPP. GDP.Growth.Rate....
## Min. : 0.038 Length:188 Length:188
## 1st Qu.: 2.575 Class :character Class :character
## Median : 9.800 Mode :character Mode :character
## Mean : 41.972
## 3rd Qu.: 31.150
## Max. :1402.100
## NA's :4
## X5.Year.GDP.Growth.Rate.... GDP.per.Capita..PPP. Unemployment....
## Length:188 Length:188 Length:188
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Inflation.... FDI.Inflow..Millions. Public.Debt....of.GDP.
## Length:188 Length:188 Length:188
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
str(economic_freedom)
## 'data.frame': 188 obs. of 37 variables:
## $ CountryID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country.Name : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ WEBNAME : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ Region : chr "Asia-Pacific" "Europe" "Middle East and North Africa" "Sub-Saharan Africa" ...
## $ World.Rank : chr "N/A" "50" "167" "139" ...
## $ Region.Rank : chr "N/A" "30" "13" "29" ...
## $ X2022.Score : chr "N/A" "66.6" "45.8" "52.6" ...
## $ Property.Rights : chr "N/A" "55.5" "27.9" "39.8" ...
## $ Judical.Effectiveness : chr "N/A" "49.8" "29.7" "25.3" ...
## $ Government.Integrity : chr "N/A" "35.6" "30.1" "20.6" ...
## $ Tax.Burden : chr "N/A" "89.1" "67.2" "86.6" ...
## $ Gov.t.Spending : chr "N/A" "72.1" "57.1" "86.4" ...
## $ Fiscal.Health : chr "N/A" "70.6" "38.6" "80.0" ...
## $ Business.Freedom : chr "N/A" "70.7" "50.0" "37.6" ...
## $ Labor.Freedom : chr "N/A" "51.1" "51.5" "53.9" ...
## $ Monetary.Freedom : chr "N/A" "82.0" "80.1" "61.2" ...
## $ Trade.Freedom : chr "N/A" "82.6" "57.4" "70.0" ...
## $ Investment.Freedom. : chr "N/A" "70" "30" "30" ...
## $ Financial.Freedom : chr "N/A" "70" "30" "40" ...
## $ Tariff.Rate.... : chr "N/A" "3.7" "13.8" "7.5" ...
## $ X : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ Income.Tax.Rate.... : chr "20.0" "23.0" "35.0" "25.0" ...
## $ Corporate.Tax.Rate.... : chr "20.0" "15.0" "26.0" "25.0" ...
## $ Tax.Burden...of.GDP : chr "7.5" "18.3" "37.2" "9.4" ...
## $ X.1 : chr "Afganistan " "Albania" "Algeria" "Angola" ...
## $ Gov.t.Expenditure...of.GDP.: chr "N/A" "30.5" "37.8" "21.3" ...
## $ Country : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ X.2 : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ Population..Millions. : num 38.9 2.8 43.9 32.9 45.4 3 25.7 8.9 10.1 0.4 ...
## $ GDP..Billions..PPP. : chr "$78.7 " "$40.7 " "$491.5 " "$215.1 " ...
## $ GDP.Growth.Rate.... : chr "-5.0" "-3.5" "-6.0" "-4.0" ...
## $ X5.Year.GDP.Growth.Rate....: chr "1.0" "2.0" "0.1" "-1.9" ...
## $ GDP.per.Capita..PPP. : chr "$2,390" "$14,218" "$11,112" "$6,932" ...
## $ Unemployment.... : chr "11.7" "11.7" "12.8" "7.7" ...
## $ Inflation.... : chr "5.6" "1.6" "2.4" "22.3" ...
## $ FDI.Inflow..Millions. : chr "13.0" "1,107.0" "1,125.0" "-1,866.0" ...
## $ Public.Debt....of.GDP. : chr "7.8" "76.0" "53.1" "127.1" ...
str(who)
## 'data.frame': 202 obs. of 358 variables:
## $ Country : chr "Afghanistan" "Albania" "Algeria" "Andorra" ...
## $ CountryID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Continent : int 1 2 3 2 3 4 5 2 6 2 ...
## $ Adolescent.fertility.rate.... : int 151 27 6 NA 146 NA 62 30 16 14 ...
## $ Adult.literacy.rate.... : num 28 98.7 69.9 NA 67.4 NA 97.2 99.4 NA NA ...
## $ Gross.national.income.per.capita..PPP.international... : int NA 6000 5940 NA 3890 15130 11670 4950 33940 36040 ...
## $ Net.primary.school.enrolment.ratio.female.... : int NA 93 94 83 49 NA 98 84 97 98 ...
## $ Net.primary.school.enrolment.ratio.male.... : int NA 94 96 83 51 NA 99 80 96 97 ...
## $ Population..in.thousands..total : int 26088 3172 33351 74 16557 84 39134 3010 20530 8327 ...
## $ Population.annual.growth.rate.... : num 4 0.6 1.5 1 2.8 1.3 1 -0.3 1.1 0.4 ...
## $ Population.in.urban.areas.... : int 23 46 64 93 54 37 90 64 88 66 ...
## $ Population.living.below.the.poverty.line....living.on..lt..US.1.per.day. : num NA 2 NA NA NA NA 6.6 2 NA NA ...
## $ Population.median.age..years. : int 16 29 24 NA 17 NA 29 32 37 40 ...
## $ Population.proportion.over.60.... : int 4 13 7 22 4 11 14 14 18 22 ...
## $ Population.proportion.under.15.... : int 47 26 29 14 46 28 26 20 19 16 ...
## $ Registration.coverage.of.births.... : int 6 90 90 90 29 NA 90 90 90 90 ...
## $ Total.fertility.rate..per.woman. : num 7.2 2.1 2.4 1.3 6.5 2.2 2.3 1.3 1.8 1.4 ...
## $ Antenatal.care.coverage...at.least.four.visits.... : int NA NA 41 NA NA NA NA 71 NA NA ...
## $ Antiretroviral.therapy.coverage.among.HIV.infected.pregt.women.for.PMTCT.... : int NA NA NA NA 14 NA NA NA NA NA ...
## $ Antiretroviral.therapy.coverage.among.people.with.advanced.HIV.infections.... : int NA NA 14 NA 16 NA 71 8 NA NA ...
## $ Births.attended.by.skilled.health.personnel.... : int 14 100 95 NA 45 100 99 98 100 NA ...
## $ Births.by.caesarean.section.... : int NA 15 6 NA NA NA NA 9 NA 21 ...
## $ Children.aged.6.59.months.who.received.vitamin.A.supplementation.... : num NA NA NA NA NA NA NA NA NA NA ...
## $ Children.aged..lt.5.years.sleeping.under.insecticide.treated.nets.... : num NA NA NA NA NA NA NA NA NA NA ...
## $ Children.aged..lt.5.years.who.received.any.antimalarial.treatment.for.fever.... : num NA NA NA NA NA NA NA NA NA NA ...
## $ Children.aged..lt.5.years.with.ARI.symptoms.taken.to.facility.... : num NA NA NA NA NA NA NA 31.9 NA NA ...
## $ Children.aged..lt.5.years.with.diarrhoea.receiving.ORT.... : num NA NA NA NA NA NA NA 65.3 NA NA ...
## $ Contraceptive.prevalence.... : num 10.3 75.1 61.4 NA 6.2 NA 65.3 53.1 NA NA ...
## $ Neonates.protected.at.birth.against.neonatal.tetanus..PAB..... : int 73 87 70 NA 81 NA NA NA NA NA ...
## $ One.year.olds.immunized.with.MCV : int 70 97 92 94 88 99 99 92 94 79 ...
## $ One.year.olds.immunized.with.three.doses.of.diphtheria.tetanus.toxoid.and.pertussis..DTP3..... : int 83 98 95 96 83 99 96 88 92 85 ...
## $ One.year.olds.immunized.with.three.doses.of.Hepatitis.B..HepB3..... : int 83 98 90 91 83 97 92 85 94 85 ...
## $ One.year.olds.immunized.with.three.doses.of.Hib..Hib3..vaccine.... : int NA NA NA 95 83 99 96 NA 94 85 ...
## $ Tuberculosis.detection.rate.under.DOTS.... : int 66 37 102 125 76 284 71 59 40 46 ...
## $ Tuberculosis.treatment.success.under.DOTS.... : int 90 77 87 80 72 100 53 72 80 75 ...
## $ Women.who.have.had.mammography.... : int NA NA NA NA NA NA NA NA 57 76 ...
## $ Women.who.have.had.PAP.smear.... : int NA NA NA NA NA NA NA NA 61 83 ...
## $ Community.and.traditional.health.workers.density..per.10.000.population. : int NA NA NA NA NA NA NA NA 2 NA ...
## $ Dentistry.personnel.density..per.10.000.population. : int NA 3 3 7 NA 2 8 4 11 5 ...
## $ Environment.and.public.health.workers.density..per.10.000.population. : int NA NA NA NA NA NA NA NA NA NA ...
## $ External.resources.for.health.as.percentage.of.total.expenditure.on.health : num 20.1 3.7 0.1 0 7 0.2 0.1 14.5 0 0 ...
## $ General.government.expenditure.on.health.as.percentage.of.total.expenditure.on.health : num 27.5 35.5 77.3 70.6 86.6 67.3 45.5 41.2 67.2 77 ...
## $ General.government.expenditure.on.health.as.percentage.of.total.government.expenditure : num 4.4 11.3 9.5 22.7 5 11.3 14.2 9.7 17.2 15.5 ...
## $ Hospital.beds..per.10.000.population. : int 4 30 17 26 1 24 41 44 40 76 ...
## $ Laboratory.health.workers.density..per.10.000.population. : int NA NA 3 NA 1 NA NA NA 4 NA ...
## $ Number.of.community.and.traditional.health.workers : int NA NA 1062 NA NA NA NA NA 3812 NA ...
## $ Number.of.dentistry.personnel : int 900 1035 9553 46 222 13 28900 1255 21296 4467 ...
## $ Number.of.environment.and.public.health.workers : int NA NA 2534 NA NA NA NA NA NA NA ...
## $ Number.of.laboratory.health.workers : int NA NA 8838 NA 2029 NA NA NA 8326 NA ...
## $ Number.of.nursing.and.midwifery.personnel : int 14930 14637 69749 259 18977 233 29000 14806 187837 53782 ...
## $ Number.of.other.health.service.providers : int NA NA 6716 NA 254 NA NA NA 42151 NA ...
## $ Number.of.pharmaceutical.personnel : int 900 1173 6333 72 919 NA 15300 157 13956 5076 ...
## $ Number.of.physicians : int 5970 3626 35368 244 1165 12 108800 11133 47875 30068 ...
## $ Nursing.and.midwifery.personnel.density..per.10.000.population. : int 5 47 22 39 14 33 8 49 97 66 ...
## $ Other.health.service.providers.density..per.10.000.population. : int NA NA 2 NA NA NA NA NA 22 NA ...
## $ Out.of.pocket.expenditure.as.percentage.of.private.expenditure.on.health : num 97.2 94.7 94.6 73.2 100 86.9 43.8 87.6 55.7 72.2 ...
## $ Per.capita.government.expenditure.on.health..PPP.int.... : int 8 127 146 2054 61 439 758 112 2097 2729 ...
## $ Per.capita.government.expenditure.on.health.at.average.exchange.rate..US.. : int 6 62 95 1987 62 348 251 41 2227 2975 ...
## $ Per.capita.total.expenditure.on.health..PPP.int.... : int 29 358 188 2910 71 652 1665 272 3122 3545 ...
## $ Per.capita.total.expenditure.on.health.at.average.exchange.rate..US.. : int 23 174 123 2815 71 517 551 99 3316 3864 ...
## $ Pharmaceutical.personnel.density..per.10.000.population. : int NA 4 2 11 NA NA 4 NA 7 6 ...
## $ Physicians.density..per.10.000.population. : num 2 12 11 36 NA 2 30 37 25 37 ...
## $ Private.expenditure.on.health.as.percentage.of.total.expenditure.on.health : num 72.5 64.5 22.7 29.4 13.4 32.7 54.5 58.8 32.8 23 ...
## $ Private.prepaid.plans.as.percentage.of.private.expenditure.on.health : num 0 0 5.2 24.6 0 13.1 51.1 0.1 22 23 ...
## $ Ratio.of.health.management.and.support.workers.to.health.service.providers : num NA NA 0.4 NA 0.01 NA NA NA 1.5 NA ...
## $ Ratio.of.nurses.and.midwives.to.physicians : num 2.5 4 2 1.1 16.9 19.3 0.3 1.3 3.9 1.8 ...
## $ Social.security.expenditure.on.health.as.percentage.of.general.government.expenditure.on.health : num 0 32.8 33.3 87.7 0 0 58.5 0 0 61 ...
## $ Total.expenditure.on.health.as.percentage.of.gross.domestic.product : num 5.4 6.2 3.6 6.3 2.7 4.9 10.1 4.7 8.7 9.9 ...
## $ Births.attended.by.skilled.health.personnel.....highest.educational.level.of.mother : num NA NA NA NA NA NA NA 97.6 NA NA ...
## $ Births.attended.by.skilled.health.personnel.....highest.wealth.quintile : num NA NA NA NA NA NA NA 100 NA NA ...
## $ Births.attended.by.skilled.health.personnel.....lowest.educational.level.of.mother : num NA NA NA NA NA NA NA 97.4 NA NA ...
## $ Births.attended.by.skilled.health.personnel.....lowest.wealth.quintile : num NA NA NA NA NA NA NA 92.8 NA NA ...
## $ Births.attended.by.skilled.health.personnel.....rural : num NA NA NA NA NA NA NA 98 NA NA ...
## $ Births.attended.by.skilled.health.personnel.....urban : num NA NA NA NA NA NA NA 98.6 NA NA ...
## $ Births.attended.by.skilled.health.personnel.difference.highest.lowest.educational.level.of.mother : num NA NA NA NA NA NA NA 0.2 NA NA ...
## $ Births.attended.by.skilled.health.personnel.difference.highest.lowest.wealth.quintile : num NA NA NA NA NA NA NA 7.2 NA NA ...
## $ Births.attended.by.skilled.health.personnel.difference.urban.rural : num NA NA NA NA NA NA NA 0.6 NA NA ...
## $ Births.attended.by.skilled.health.personnel.ratio.highest.lowest.educational.level.of.mother : num NA NA NA NA NA NA NA 1 NA NA ...
## $ Births.attended.by.skilled.health.personnel.ratio.highest.lowest.wealth.quintile : num NA NA NA NA NA NA NA 1.1 NA NA ...
## $ Births.attended.by.skilled.health.personnel.ratio.urban.rural : num NA NA NA NA NA NA NA 1 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.....highest.educational.level.of.mother : num NA NA NA NA NA NA NA 79.4 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.....highest.wealth.quintile : num NA NA NA NA NA NA NA 60.7 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.....lowest.educational.level.of.mother : num NA NA NA NA NA NA NA 70.8 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.....lowest.wealth.quintile : num NA NA NA NA NA NA NA 71.6 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.....rural : num NA NA NA NA NA NA NA 80.4 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.....urban : num NA NA NA NA NA NA NA 67 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.difference.highest.lowest.educational.level.of.mother : num NA NA NA NA NA NA NA 8.6 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.difference.highest.lowest.wealth.quintile : num NA NA NA NA NA NA NA -10.9 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.difference.urban.rural : num NA NA NA NA NA NA NA -13.4 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.ratio.highest.lowest.educational.level.of.mother : num NA NA NA NA NA NA NA 1.1 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.ratio.highest.lowest.wealth.quintile : num NA NA NA NA NA NA NA 0.8 NA NA ...
## $ Measles.immunization.coverage.among.one.year.olds.ratio.urban.rural : num NA NA NA NA NA NA NA 0.8 NA NA ...
## $ Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..difference.lowest.highest.educational.level.of.mother: num NA NA NA NA NA NA NA 6 NA NA ...
## $ Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..difference.lowest.highest.wealth.quintile : num NA NA NA NA NA NA NA 29 NA NA ...
## $ Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..difference.rural.urban : num NA NA NA NA NA NA NA 16 NA NA ...
## $ Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..highest.educational.level.of.mother : num NA NA NA NA NA NA NA 27 NA NA ...
## $ Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..highest.wealth.quintile : num NA NA NA NA NA NA NA 23 NA NA ...
## $ Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..lowest.educational.level.of.mother : num NA NA NA NA NA NA NA 33 NA NA ...
## $ Under.5.mortality.rate..Probability.of.dying.aged..lt..5.years.per.1.000.live.births..lowest.wealth.quintile : num NA NA NA NA NA NA NA 52 NA NA ...
## [list output truncated]
Economic freedom dataset has some interesnting indicators apart from the classical ones of GDP, GNI… In general, we can use some “social” and health variables of the who dataset to meassure the quality of life and social freedom, and the other dataset to meassure the Economic freedom of each country. Therefore, if we merge both datasets we can see which are the freer countries in all aspects (and if they are also the ones with a better quality of life).
To merge the datasets, we need a common variable. In our case it is the name of the country. But as I get the datasets from different sources, we need to check if the country names are written in the same way
# Are the countries written in the same way?
for(j in 1:nrow(economic_freedom)){
if(!any(economic_freedom[j,27] == who[,1])){
cat("The row", j, "is written different or does not appear in the WHO dataset:", economic_freedom[j,27], "\n")
}
}
## The row 26 is written different or does not appear in the WHO dataset: Burma
## The row 31 is written different or does not appear in the WHO dataset: Cabo Verde
## The row 38 is written different or does not appear in the WHO dataset: Congo, Democratic Republic of the Congo
## The row 39 is written different or does not appear in the WHO dataset: Congo, Republic of
## The row 41 is written different or does not appear in the WHO dataset: Côte d'Ivoire
## The row 56 is written different or does not appear in the WHO dataset: Eswatini
## The row 77 is written different or does not appear in the WHO dataset: Iran
## The row 88 is written different or does not appear in the WHO dataset: Korea, North
## The row 89 is written different or does not appear in the WHO dataset: Korea, South
## The row 90 is written different or does not appear in the WHO dataset: Kosovo
## The row 92 is written different or does not appear in the WHO dataset: Kyrgyz Republic
## The row 93 is written different or does not appear in the WHO dataset: Lao P.D.R.
## The row 98 is written different or does not appear in the WHO dataset: Libya
## The row 99 is written different or does not appear in the WHO dataset: Liechtenstein
## The row 112 is written different or does not appear in the WHO dataset: Micronesia
## The row 142 is written different or does not appear in the WHO dataset: São Tomé and Príncipe
## The row 149 is written different or does not appear in the WHO dataset: Slovak Republic
## The row 161 is written different or does not appear in the WHO dataset: Taiwan
## The row 176 is written different or does not appear in the WHO dataset: United States
## The row 185 is written different or does not appear in the WHO dataset:
## The row 186 is written different or does not appear in the WHO dataset:
## The row 187 is written different or does not appear in the WHO dataset:
## The row 188 is written different or does not appear in the WHO dataset:
# We can see that there are about 20 countries that appear in both
# datasets but with different names. For example, we have "United States"
# on the economic_freedom dataset, and "United States of America"
# on the other.
# Let's change the names so they appear in the same way in both data sets.
who$Country[which(who$Country == 'Cape Verde')] = 'Cabo Verde'
economic_freedom$Country[which(economic_freedom$Country == "Congo, Democratic Republic of the Congo")]=
"Congo, Dem. Rep."
economic_freedom$Country[which(economic_freedom$Country == "Congo, Republic of")]=
"Congo, Rep."
economic_freedom$Country[which(economic_freedom$Country == "Côte d'Ivoire")]=
"Cote d'Ivoire"
who$Country[which(who$Country == "Iran (Islamic Republic of)")]=
"Iran"
who$Country[which(who$Country == 'Korea, Dem. Rep.')] = "Korea, North "
who$Country[which(who$Country == 'Korea, Rep.')] = 'Korea, South'
economic_freedom$Country[which(economic_freedom$Country == "Kyrgyz Republic")]=
"Kyrgyzstan"
who$Country[which(who$Country == "Lao People's Democratic Republic")] = "Lao P.D.R."
who$Country[which(who$Country == "Libyan Arab Jamahiriya")] = 'Libya'
economic_freedom$Country[which(economic_freedom$Country == "North Macedonia")]=
"Macedonia"
who$Country[which(who$Country == "Micronesia (Federated States of)")] = 'Micronesia'
economic_freedom$Country[which(economic_freedom$Country == "São Tomé and PrÃncipe")]=
"Sao Tome and Principe"
economic_freedom$Country[which(economic_freedom$Country == "Slovak Republic")]=
"Slovakia"
economic_freedom$Country[which(economic_freedom$Country == "Taiwan ")]=
"Taiwan"
who$Country[which(who$Country == "United States of America")] = 'United States'
economic_freedom$Country[which(economic_freedom$Country == "Burma")]=
"Myanmar"
who$Country[which(who$Country == "Swaziland")] = 'Eswatini'
# If we check again the countries in common, we now see that all the countries that appear on the economic_freedom dataset are also in the other one, except from just Kosovo and Liechtenstein.
for(j in 1:nrow(economic_freedom)){
if(!any(economic_freedom[j,27] == who[,1])){
cat("The row", j, "is not in both dataset:", economic_freedom[j,27], "\n")
}
}
## The row 41 is not in both dataset: Côte d'Ivoire
## The row 90 is not in both dataset: Kosovo
## The row 99 is not in both dataset: Liechtenstein
## The row 142 is not in both dataset: São Tomé and Príncipe
## The row 185 is not in both dataset:
## The row 186 is not in both dataset:
## The row 187 is not in both dataset:
## The row 188 is not in both dataset:
# There is also something strange with the lines from 185 to 188 of
# the economic dataset. It seems that they don't have any value
# on the variable Country (the name of the country). Let's have a look.
for(i in 1:nrow(economic_freedom)){
if(is.na(economic_freedom[i,1])){
cat("Row ",i," has NA values\n")
}
}
## Row 185 has NA values
## Row 186 has NA values
## Row 187 has NA values
## Row 188 has NA values
md.pattern(economic_freedom)
## Country.Name WEBNAME Region World.Rank Region.Rank X2022.Score
## 183 1 1 1 1 1 1
## 4 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 0 0 0 0 0 0
## Property.Rights Judical.Effectiveness Government.Integrity Tax.Burden
## 183 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## Gov.t.Spending Fiscal.Health Business.Freedom Labor.Freedom
## 183 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## Monetary.Freedom Trade.Freedom Investment.Freedom. Financial.Freedom
## 183 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## Tariff.Rate.... X Income.Tax.Rate.... Corporate.Tax.Rate....
## 183 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## Tax.Burden...of.GDP X.1 Gov.t.Expenditure...of.GDP. Country X.2
## 183 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 0 0 0 0 0
## GDP..Billions..PPP. GDP.Growth.Rate.... X5.Year.GDP.Growth.Rate....
## 183 1 1 1
## 4 1 1 1
## 1 1 1 1
## 0 0 0
## GDP.per.Capita..PPP. Unemployment.... Inflation.... Public.Debt....of.GDP.
## 183 1 1 1 1
## 4 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## FDI.Inflow..Millions. CountryID Population..Millions.
## 183 1 1 1 0
## 4 1 0 0 2
## 1 0 1 1 1
## 1 4 4 9
# After doing this, we know that the last 4 rows of the dataset are empty.
# We can remove them, but as we are going to do a merge now, we are going
# get rid of them anyway.
# Now, we can finally do the merge.
# We have lost some countries of the who dataset, since not all the
# countries that appear there were also in the economic_freedom dataset,
# which is normal.
total.data= merge(x = who, y = economic_freedom, by = 'Country')
Let’s select some relevant variables. Later we will see which are the more important ones.
data =
total.data %>% select(Country,Total.fertility.rate..per.woman.,
Gross.national.income.per.capita..PPP.international...,
Population.living.below.the.poverty.line....living.on..lt..US.1.per.day.,
Adult.mortality.rate..probability.of.dying.between.15.to.60.years.per.1000.population..both.sexes,
Infant.mortality.rate..per.1.000.live.births..both.sexes,
Life.expectancy.at.birth..years..both.sexes,
Under.5.mortality.rate..probability.of.dying.by.age.5.per.1000.live.births..both.sexes,
CO2_emissions, Cell_phones_per_100_people,
Income_per_person, Inflation...., Business.Freedom,
Internet_users, Democracy_score,
Judical.Effectiveness, X2022.Score, Property.Rights,
Unemployment...., GDP.per.Capita..PPP.)
New variable I also think that it would be interesting to have a variable that was representative of the ‘equality’ in the country. Since there is not variable that measures exactly that, I think a good approximation is to check of the percentage of women and men enrolled in primary school is similar. Countries that are more advanced in equality tend to have an enrollment ratio in primary school similar in both sexes. Hence, I am going to create a new variable called ‘education_equality’ that shows this information:
data$Education_equality = total.data$Net.primary.school.enrolment.ratio.female..../total.data$Net.primary.school.enrolment.ratio.male....
# FIND the NAs
# How many NAs we have?
sum(is.na(data))
## [1] 182
# With the function summary we can see more in detail in which variable
# the NAs are.
summary(data)
## Country Total.fertility.rate..per.woman.
## Length:180 Min. :1.200
## Class :character 1st Qu.:1.800
## Mode :character Median :2.500
## Mean :3.026
## 3rd Qu.:3.950
## Max. :7.300
## NA's :1
## Gross.national.income.per.capita..PPP.international...
## Min. : 260
## 1st Qu.: 2135
## Median : 6110
## Mean :11349
## 3rd Qu.:14695
## Max. :60870
## NA's :9
## Population.living.below.the.poverty.line....living.on..lt..US.1.per.day.
## Min. : 2.00
## 1st Qu.: 2.00
## Median : 7.40
## Mean :16.04
## 3rd Qu.:23.10
## Max. :70.80
## NA's :109
## Adult.mortality.rate..probability.of.dying.between.15.to.60.years.per.1000.population..both.sexes
## Min. : 58
## 1st Qu.:120
## Median :186
## Mean :224
## 3rd Qu.:284
## Max. :751
## NA's :1
## Infant.mortality.rate..per.1.000.live.births..both.sexes
## Min. : 2.00
## 1st Qu.: 9.00
## Median : 23.00
## Mean : 38.93
## 3rd Qu.: 60.00
## Max. :165.00
## NA's :1
## Life.expectancy.at.birth..years..both.sexes
## Min. :40.00
## 1st Qu.:61.00
## Median :70.00
## Mean :67.09
## 3rd Qu.:75.00
## Max. :83.00
## NA's :1
## Under.5.mortality.rate..probability.of.dying.by.age.5.per.1000.live.births..both.sexes
## Min. : 3.0
## 1st Qu.: 10.0
## Median : 27.0
## Mean : 56.4
## 3rd Qu.: 81.0
## Max. :269.0
## NA's :1
## CO2_emissions Cell_phones_per_100_people Income_per_person
## Min. : 0.0100 Min. : 0.300 Min. : 264
## 1st Qu.: 0.6275 1st Qu.: 8.575 1st Qu.: 1946
## Median : 2.1850 Median : 35.650 Median : 5493
## Mean : 5.0928 Mean : 43.282 Mean :11179
## 3rd Qu.: 7.0925 3rd Qu.: 75.275 3rd Qu.:14490
## Max. :57.7200 Max. :154.800 Max. :70014
## NA's :6 NA's :6 NA's :4
## Inflation.... Business.Freedom Internet_users Democracy_score
## Length:180 Length:180 Min. : 0.00 Min. :-10.00
## Class :character Class :character 1st Qu.: 2.60 1st Qu.: -3.00
## Mode :character Mode :character Median : 8.20 Median : 6.00
## Mean :17.32 Mean : 3.34
## 3rd Qu.:26.07 3rd Qu.: 9.00
## Max. :76.20 Max. : 10.00
## NA's :6 NA's :27
## Judical.Effectiveness X2022.Score Property.Rights Unemployment....
## Length:180 Length:180 Length:180 Length:180
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## GDP.per.Capita..PPP. Education_equality
## Length:180 Min. :0.5455
## Class :character 1st Qu.:0.9673
## Mode :character Median :1.0000
## Mean :0.9715
## 3rd Qu.:1.0104
## Max. :1.0989
## NA's :10
# There are also some NAs that appear as characters ("n/a").
# So let's change them by NA, so we can deal with all of them easier.
data[data == "n/a" |data == "N/A" ] <- NA
# Check that we have replace all of them:
which(data=="n/a" | data== "N/A")
## integer(0)
Casting variables
data$X2022.Score= as.numeric(data$X2022.Score)
data$Property.Rights= as.numeric(data$Property.Rights)
data$Judical.Effectiveness= as.numeric(data$Judical.Effectiveness)
data$Inflation....=as.numeric(data$Inflation....)
data$Unemployment....= as.numeric(data$Unemployment....)
data$Business.Freedom = as.numeric(data$Business.Freedom)
How can we cast the numbers that can with a dollar?
# The variable of the GDP has the problem that comes with the dollar sign ($),so we need to remove it before we cast it to numeric
for(i in 1:nrow(data)){
data$GDP.per.Capita..PPP.[i] =
substr(data$GDP.per.Capita..PPP.[i], 2,nchar(data$GDP.per.Capita..PPP.[i]))
}
data$GDP.per.Capita..PPP. =as.numeric(gsub(",", "", data$GDP.per.Capita..PPP.))
# Gsub is for R to know that the decimal value is the comma.
Let’s change the name of the variables so they are shorter:
# Change the name of the variables so they are shorter
colnames(data) <- c('Country', 'Fertility', 'GNI_Capita_PPP', 'Poverty','Adult_Mortality', 'Infant_Mortality',
'Life_Expentancy', 'Under5_mortality', 'CO2', 'Cell_phones',
'Income_per_person', 'Inflation', 'Business_Freedom',
"Internet_users","Democracy","Judical_Effectiveness","Economic_freedom",
"Property_Rights", "Unemployment", 'GDP_capita_PPP', 'Education_Equality')
# How many NAs we have?
sum(is.na(data))
## [1] 215
length(which(is.na(data))) # Another option
## [1] 215
# Now we are going to see more in detail how the missing values are
# distributed in our data set:
# How many rows contain missing values?
length(which(!complete.cases(data)))
## [1] 110
summary(aggr(data))
##
## Missings per variable:
## Variable Count
## Country 0
## Fertility 1
## GNI_Capita_PPP 9
## Poverty 109
## Adult_Mortality 1
## Infant_Mortality 1
## Life_Expentancy 1
## Under5_mortality 1
## CO2 6
## Cell_phones 6
## Income_per_person 4
## Inflation 4
## Business_Freedom 6
## Internet_users 6
## Democracy 27
## Judical_Effectiveness 6
## Economic_freedom 6
## Property_Rights 6
## Unemployment 3
## GDP_capita_PPP 2
## Education_Equality 10
##
## Missings in combinations of variables:
## Combinations Count Percent
## 0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0 70 38.8888889
## 0:0:0:0:0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:0:0 1 0.5555556
## 0:0:0:1:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0 67 37.2222222
## 0:0:0:1:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:1 3 1.6666667
## 0:0:0:1:0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:0:0 17 9.4444444
## 0:0:0:1:0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:0:1 1 0.5555556
## 0:0:0:1:0:0:0:0:0:0:0:0:0:0:1:0:0:0:1:0:0 3 1.6666667
## 0:0:0:1:0:0:0:0:0:0:0:0:1:0:0:1:1:1:0:0:0 2 1.1111111
## 0:0:0:1:0:0:0:0:0:0:0:1:1:0:0:1:1:1:0:1:0 1 0.5555556
## 0:0:0:1:0:0:0:0:1:0:0:0:0:0:0:0:0:0:0:0:0 2 1.1111111
## 0:0:0:1:0:0:0:0:1:1:1:0:0:1:1:0:0:0:0:0:0 1 0.5555556
## 0:0:0:1:0:0:0:0:1:1:1:0:0:1:1:0:0:0:0:0:1 3 1.6666667
## 0:0:1:1:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0 2 1.1111111
## 0:0:1:1:0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:0:0 1 0.5555556
## 0:0:1:1:0:0:0:0:0:0:0:0:1:0:0:1:1:1:0:0:0 1 0.5555556
## 0:0:1:1:0:0:0:0:0:0:0:0:1:0:0:1:1:1:0:0:1 1 0.5555556
## 0:0:1:1:0:0:0:0:0:0:0:1:0:0:0:0:0:0:0:0:0 1 0.5555556
## 0:0:1:1:0:0:0:0:0:0:0:1:1:0:0:1:1:1:0:0:0 1 0.5555556
## 0:0:1:1:0:0:0:0:0:1:0:1:0:1:0:0:0:0:0:1:1 1 0.5555556
## 0:1:1:1:1:1:1:1:0:1:0:0:0:1:0:0:0:0:0:0:1 1 0.5555556
# The plot on the right part give us the following information (that it is also showed in detail on the console):
# The blue row (on the bottom) shows that there are some rows that contain no NA (the 39% of our rows indeed)
# The second row from the bottom also shows us that there are also some rows with no NA except in the variable Poverty (that we have already removed), which represents the 38 % of our data.
# Another significant thing that we can conclude from the top row of the graph is that there is a row were many variables have a missing value.
# We see that the variable poverty has NA in more than half of the rows, so we are going to remove that variable.
data$Poverty=NULL
Another way to visualize the NAs
missmap(data, main = "Missing Values", col = c("pink", "snow2"))
Remove rows with many NAs
# To know the number of NA per row or column:
colSums(is.na(data))
## Country Fertility GNI_Capita_PPP
## 0 1 9
## Adult_Mortality Infant_Mortality Life_Expentancy
## 1 1 1
## Under5_mortality CO2 Cell_phones
## 1 6 6
## Income_per_person Inflation Business_Freedom
## 4 4 6
## Internet_users Democracy Judical_Effectiveness
## 6 27 6
## Economic_freedom Property_Rights Unemployment
## 6 6 3
## GDP_capita_PPP Education_Equality
## 2 10
rowSums(is.na(data))
## [1] 6 0 0 0 0 0 0 1 0 2 0 0 1 0 0 1 0 0 0 2 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [38] 1 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## [75] 0 5 0 0 0 0 0 0 0 0 2 6 0 0 0 0 0 0 1 0 4 0 1 1 0 0 0 1 0 1 0 0 0 6 0 0 6
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 6 2 0 0 0 1 0 6
## [149] 0 0 0 0 1 0 1 6 9 0 0 0 5 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 4 0 1
#Let's remove every row with more than 6 missing values.
vec <-rowSums(is.na(data))
data <- data[-which(vec>6), ]
# If we have a look at our data again, it looks as follows:
summary(aggr(data))
##
## Missings per variable:
## Variable Count
## Country 0
## Fertility 0
## GNI_Capita_PPP 8
## Adult_Mortality 0
## Infant_Mortality 0
## Life_Expentancy 0
## Under5_mortality 0
## CO2 6
## Cell_phones 5
## Income_per_person 4
## Inflation 4
## Business_Freedom 6
## Internet_users 5
## Democracy 27
## Judical_Effectiveness 6
## Economic_freedom 6
## Property_Rights 6
## Unemployment 3
## GDP_capita_PPP 2
## Education_Equality 9
##
## Missings in combinations of variables:
## Combinations Count Percent
## 0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0 137 76.5363128
## 0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:1 3 1.6759777
## 0:0:0:0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:0:0 18 10.0558659
## 0:0:0:0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:0:1 1 0.5586592
## 0:0:0:0:0:0:0:0:0:0:0:0:0:1:0:0:0:1:0:0 3 1.6759777
## 0:0:0:0:0:0:0:0:0:0:0:1:0:0:1:1:1:0:0:0 2 1.1173184
## 0:0:0:0:0:0:0:0:0:0:1:1:0:0:1:1:1:0:1:0 1 0.5586592
## 0:0:0:0:0:0:0:1:0:0:0:0:0:0:0:0:0:0:0:0 2 1.1173184
## 0:0:0:0:0:0:0:1:1:1:0:0:1:1:0:0:0:0:0:0 1 0.5586592
## 0:0:0:0:0:0:0:1:1:1:0:0:1:1:0:0:0:0:0:1 3 1.6759777
## 0:0:1:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0 2 1.1173184
## 0:0:1:0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:0:0 1 0.5586592
## 0:0:1:0:0:0:0:0:0:0:0:1:0:0:1:1:1:0:0:0 1 0.5586592
## 0:0:1:0:0:0:0:0:0:0:0:1:0:0:1:1:1:0:0:1 1 0.5586592
## 0:0:1:0:0:0:0:0:0:0:1:0:0:0:0:0:0:0:0:0 1 0.5586592
## 0:0:1:0:0:0:0:0:0:0:1:1:0:0:1:1:1:0:0:0 1 0.5586592
## 0:0:1:0:0:0:0:0:1:0:1:0:1:0:0:0:0:0:1:1 1 0.5586592
# To do this we are going to use Multiple Imputation:
set.seed(123)
md.pattern(data)
## Country Fertility Adult_Mortality Infant_Mortality Life_Expentancy
## 137 1 1 1 1 1
## 18 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 3 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 0 0 0 0 0
## Under5_mortality GDP_capita_PPP Unemployment Income_per_person Inflation
## 137 1 1 1 1 1
## 18 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 0
## 1 1 1 1 1 0
## 1 1 1 1 0 1
## 3 1 1 1 0 1
## 3 1 1 0 1 1
## 1 1 0 1 1 0
## 1 1 0 1 1 0
## 0 2 3 4 4
## Cell_phones Internet_users CO2 Business_Freedom Judical_Effectiveness
## 137 1 1 1 1 1
## 18 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 0 0
## 1 1 1 1 0 0
## 1 1 1 1 0 0
## 2 1 1 0 1 1
## 1 1 1 1 1 1
## 1 1 1 1 0 0
## 1 0 0 0 1 1
## 3 0 0 0 1 1
## 3 1 1 1 1 1
## 1 1 1 1 0 0
## 1 0 0 1 1 1
## 5 5 6 6 6
## Economic_freedom Property_Rights GNI_Capita_PPP Education_Equality
## 137 1 1 1 1
## 18 1 1 1 1
## 3 1 1 1 0
## 1 1 1 1 0
## 2 1 1 0 1
## 1 1 1 0 1
## 2 0 0 1 1
## 1 0 0 0 1
## 1 0 0 0 0
## 2 1 1 1 1
## 1 1 1 0 1
## 1 0 0 0 1
## 1 1 1 1 1
## 3 1 1 1 0
## 3 1 1 1 1
## 1 0 0 1 1
## 1 1 1 0 0
## 6 6 8 9
## Democracy
## 137 1 0
## 18 0 1
## 3 1 1
## 1 0 2
## 2 1 1
## 1 0 2
## 2 1 4
## 1 1 5
## 1 1 6
## 2 1 1
## 1 1 2
## 1 1 6
## 1 0 5
## 3 0 6
## 3 0 2
## 1 1 6
## 1 1 6
## 27 97
vec1 <- c()
# Which are the variables with missing values?
for(i in 1:ncol(data)){
if(any(is.na(data[,i]))){
vec1 <- c(vec1,i)
}
}
# We use Random Forest imputations
imp=mice(data[,vec1], method = 'rf')
##
## iter imp variable
## 1 1 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 1 2 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 1 3 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 1 4 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 1 5 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 1 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 2 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 3 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 4 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 5 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 1 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 2 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 3 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 4 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 5 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 1 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 2 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 3 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 4 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 5 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 1 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 2 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 3 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 4 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 5 GNI_Capita_PPP CO2 Cell_phones Income_per_person Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## Warning: Number of logged events: 18
Logged events
# A warning is shown, we have 25 logged events
head(imp$loggedEvents, 3)
tail(imp$loggedEvents, 3)
imp$loggedEvents
# What we see is that it is computing the missing values of GNI_Capita
# with the variable Income, and vice versa.
# What we can assume by analyzing this is that this two variables are
# highly correlated. Let's check it:
pairs(data$GNI_Capita_PPP ~ data$Income_per_person)
# They are highly correlated.
# After doing some reseacrh this is because the variable GNI_capita_PPP represents the gross national income per capita adjusted to the purchasing power parity, and the variable income is the gross national income but not adjusted to the purchasing power parity. Now that we have notice this, we can remove the variable Income of our dataset (GNI per capita PPP is better to do comparitions between countries than GNI per capita).
data = data[, -10]
# Now, we do the multiple imputation again:
vec1 <- c()
for(i in 1:ncol(data)){
if(any(is.na(data[,i]))){
vec1 <- c(vec1,i)
}
}
imp=mice(data[,vec1], method = 'rf')
##
## iter imp variable
## 1 1 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 1 2 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 1 3 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 1 4 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 1 5 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 1 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 2 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 3 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 4 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 2 5 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 1 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 2 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 3 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 4 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 3 5 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 1 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 2 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 3 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 4 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 4 5 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 1 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 2 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 3 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 4 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
## 5 5 GNI_Capita_PPP CO2 Cell_phones Inflation Business_Freedom Internet_users Democracy Judical_Effectiveness Economic_freedom Property_Rights Unemployment GDP_capita_PPP Education_Equality
data_imp=mice::complete(imp)
data$GNI_Capita_PPP = data_imp$GNI_Capita_PPP
data$CO2 =data_imp$CO2
data$Cell_phones = data_imp$Cell_phones
data$Inflation = data_imp$Inflation
data$Business_Freedom = data_imp$Business_Freedom
data$Internet_users = data_imp$Internet_users
data$Democracy = data_imp$Democracy
data$Judical_Effectiveness = data_imp$Judical_Effectiveness
data$Economic_freedom = data_imp$Economic_freedom
data$Property_Rights = data_imp$Property_Rights
data$Unemployment = data_imp$Unemployment
data$GDP_capita_PPP = data_imp$GDP_capita_PPP
data$Education_Equality = data_imp$Education_Equality
# Now, we don't have any missing value:
any(is.na(data)==TRUE)
## [1] FALSE
Let’s add this categorical variable which later may be interesting for the graphs
data$Region = total.data$Region[-which(vec>6)]
data$Region= as.factor(data$Region)
With graphs:
# I am specially interested in the variable that measures the economic freedom of a country, so let's see if it has many outliers and we need to reduce noise.
ggplot(data = data)+aes(x=Unemployment, y = Economic_freedom, color =Region)+geom_point()
ggplot(data = data)+aes(x=Region, y = Economic_freedom, fill =Region)+geom_boxplot()
ggplot(data) +
aes(x = Economic_freedom) +
geom_histogram(bins = 30L, fill = "red") +
theme_minimal()
# With this 3 graphs we see clearly that, at least we are going to have one outlier (the value far away from the rest in the histogram)
The easiest way to identify the outliers with a graph is doing a boxplot
boxplot(data$Economic_freedom,ylab = "Economic Freedom")
# We have 3 outliers, but there is one that it is specially extreme, which is North Korea:
min(data$Economic_freedom)
## [1] 3
data$Country[which(data$Economic_freedom ==min(data$Economic_freedom))]
## [1] "Korea, North "
Another ways to identify the outliers:
# 1) With function outlier
idx = outlier(data$Economic_freedom, logical=T)
data$Country[idx]
## [1] "Korea, North "
# 2) 3 sigma method
mu <- mean(data$Economic_freedom)
sigma <- sd(data$Economic_freedom)
sum(data$Economic_freedom < mu - 3*sigma | data$Economic_freedom > mu + 3*sigma)
## [1] 1
data$Country[which(data$Economic_freedom < mu - 3*sigma | data$Economic_freedom > mu + 3*sigma)]
## [1] "Korea, North "
# 3)Identification by IQR:
QI <- quantile(data$Economic_freedom, 0.25, na.rm = TRUE)
QS <- quantile(data$Economic_freedom, 0.75, na.rm = TRUE)
IQR = QS-QI
sum(data$Economic_freedom < QI - 1.5*IQR | data$Economic_freedom > QS + 1.5*IQR)
## [1] 4
data$Country[which(data$Economic_freedom < QI - 1.5*IQR | data$Economic_freedom > QS + 1.5*IQR)]
## [1] "Afghanistan" "Cuba" "Korea, North " "Venezuela"
There may be allso outliers with respect to other variables, for instance, if we try to explain the GNI by economic freedom and unemployment rate:
# Let’s explain the GNI by economic freedom and unemployment rate:
lm.fit = lm(Economic_freedom ~GNI_Capita_PPP, data)
resid = residuals(lm.fit)
qplot(data$GNI_Capita_PPP, resid)
summary(lm(Economic_freedom ~ Country + GNI_Capita_PPP, data))
##
## Call:
## lm(formula = Economic_freedom ~ Country + GNI_Capita_PPP, data = data)
##
## Residuals:
## ALL 179 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.950e+01 NaN NaN NaN
## CountryAlbania 3.710e+01 NaN NaN NaN
## CountryAlgeria 1.630e+01 NaN NaN NaN
## CountryAngola 2.310e+01 NaN NaN NaN
## CountryArgentina 2.060e+01 NaN NaN NaN
## CountryArmenia 3.580e+01 NaN NaN NaN
## CountryAustralia 4.820e+01 NaN NaN NaN
## CountryAustria 4.430e+01 NaN NaN NaN
## CountryAzerbaijan 3.210e+01 NaN NaN NaN
## CountryBahamas 3.920e+01 NaN NaN NaN
## CountryBahrain 3.250e+01 NaN NaN NaN
## CountryBangladesh 2.320e+01 NaN NaN NaN
## CountryBarbados 4.180e+01 NaN NaN NaN
## CountryBelarus 2.350e+01 NaN NaN NaN
## CountryBelgium 4.010e+01 NaN NaN NaN
## CountryBelize 2.710e+01 NaN NaN NaN
## CountryBenin 3.150e+01 NaN NaN NaN
## CountryBhutan 2.980e+01 NaN NaN NaN
## CountryBolivia 1.350e+01 NaN NaN NaN
## CountryBosnia and Herzegovina 3.390e+01 NaN NaN NaN
## CountryBotswana 3.530e+01 NaN NaN NaN
## CountryBrazil 2.380e+01 NaN NaN NaN
## CountryBrunei Darussalam 3.530e+01 NaN NaN NaN
## CountryBulgaria 4.150e+01 NaN NaN NaN
## CountryBurkina Faso 2.880e+01 NaN NaN NaN
## CountryBurundi 9.900e+00 NaN NaN NaN
## CountryCabo Verde 3.720e+01 NaN NaN NaN
## CountryCambodia 2.760e+01 NaN NaN NaN
## CountryCameroon 2.340e+01 NaN NaN NaN
## CountryCanada 4.710e+01 NaN NaN NaN
## CountryCentral African Republic 1.620e+01 NaN NaN NaN
## CountryChad 2.030e+01 NaN NaN NaN
## CountryChile 4.490e+01 NaN NaN NaN
## CountryChina 1.850e+01 NaN NaN NaN
## CountryColombia 3.560e+01 NaN NaN NaN
## CountryComoros 2.090e+01 NaN NaN NaN
## CountryCongo, Dem. Rep. 1.810e+01 NaN NaN NaN
## CountryCongo, Rep. 1.900e+01 NaN NaN NaN
## CountryCosta Rica 3.590e+01 NaN NaN NaN
## CountryCroatia 3.810e+01 NaN NaN NaN
## CountryCuba 4.724e-13 NaN NaN NaN
## CountryCyprus 4.340e+01 NaN NaN NaN
## CountryCzech Republic 4.490e+01 NaN NaN NaN
## CountryDenmark 4.850e+01 NaN NaN NaN
## CountryDjibouti 2.580e+01 NaN NaN NaN
## CountryDominica 2.490e+01 NaN NaN NaN
## CountryDominican Republic 3.350e+01 NaN NaN NaN
## CountryEcuador 2.480e+01 NaN NaN NaN
## CountryEgypt 1.960e+01 NaN NaN NaN
## CountryEl Salvador 3.010e+01 NaN NaN NaN
## CountryEquatorial Guinea 1.770e+01 NaN NaN NaN
## CountryEritrea 1.020e+01 NaN NaN NaN
## CountryEstonia 5.050e+01 NaN NaN NaN
## CountryEswatini 2.190e+01 NaN NaN NaN
## CountryEthiopia 2.010e+01 NaN NaN NaN
## CountryFiji 2.690e+01 NaN NaN NaN
## CountryFinland 4.880e+01 NaN NaN NaN
## CountryFrance 3.640e+01 NaN NaN NaN
## CountryGabon 2.630e+01 NaN NaN NaN
## CountryGambia 2.850e+01 NaN NaN NaN
## CountryGeorgia 4.230e+01 NaN NaN NaN
## CountryGermany 4.660e+01 NaN NaN NaN
## CountryGhana 3.030e+01 NaN NaN NaN
## CountryGreece 3.200e+01 NaN NaN NaN
## CountryGuatemala 3.370e+01 NaN NaN NaN
## CountryGuinea 2.470e+01 NaN NaN NaN
## CountryGuinea-Bissau 1.650e+01 NaN NaN NaN
## CountryGuyana 3.000e+01 NaN NaN NaN
## CountryHaiti 2.050e+01 NaN NaN NaN
## CountryHonduras 3.000e+01 NaN NaN NaN
## CountryHungary 3.740e+01 NaN NaN NaN
## CountryIceland 4.750e+01 NaN NaN NaN
## CountryIndia 2.440e+01 NaN NaN NaN
## CountryIndonesia 3.490e+01 NaN NaN NaN
## CountryIran 1.290e+01 NaN NaN NaN
## CountryIraq 2.340e+01 NaN NaN NaN
## CountryIreland 5.250e+01 NaN NaN NaN
## CountryIsrael 3.850e+01 NaN NaN NaN
## CountryItaly 3.590e+01 NaN NaN NaN
## CountryJamaica 3.790e+01 NaN NaN NaN
## CountryJapan 4.040e+01 NaN NaN NaN
## CountryJordan 3.060e+01 NaN NaN NaN
## CountryKazakhstan 3.490e+01 NaN NaN NaN
## CountryKenya 2.310e+01 NaN NaN NaN
## CountryKiribati 2.970e+01 NaN NaN NaN
## CountryKorea, North -2.650e+01 NaN NaN NaN
## CountryKorea, South 4.510e+01 NaN NaN NaN
## CountryKuwait 2.880e+01 NaN NaN NaN
## CountryKyrgyzstan 2.630e+01 NaN NaN NaN
## CountryLao P.D.R. 1.970e+01 NaN NaN NaN
## CountryLatvia 4.530e+01 NaN NaN NaN
## CountryLebanon 1.780e+01 NaN NaN NaN
## CountryLesotho 1.860e+01 NaN NaN NaN
## CountryLiberia 1.840e+01 NaN NaN NaN
## CountryLibya 4.230e+01 NaN NaN NaN
## CountryLithuania 4.630e+01 NaN NaN NaN
## CountryLuxembourg 5.110e+01 NaN NaN NaN
## CountryMacedonia 3.620e+01 NaN NaN NaN
## CountryMadagascar 2.940e+01 NaN NaN NaN
## CountryMalawi 2.350e+01 NaN NaN NaN
## CountryMalaysia 3.860e+01 NaN NaN NaN
## CountryMaldives 1.780e+01 NaN NaN NaN
## CountryMali 2.640e+01 NaN NaN NaN
## CountryMalta 4.200e+01 NaN NaN NaN
## CountryMauritania 2.580e+01 NaN NaN NaN
## CountryMauritius 4.140e+01 NaN NaN NaN
## CountryMexico 3.420e+01 NaN NaN NaN
## CountryMicronesia 3.150e+01 NaN NaN NaN
## CountryMoldova 3.180e+01 NaN NaN NaN
## CountryMongolia 3.440e+01 NaN NaN NaN
## CountryMontenegro 2.830e+01 NaN NaN NaN
## CountryMorocco 2.970e+01 NaN NaN NaN
## CountryMozambique 2.180e+01 NaN NaN NaN
## CountryMyanmar 2.010e+01 NaN NaN NaN
## CountryNamibia 2.970e+01 NaN NaN NaN
## CountryNepal 2.020e+01 NaN NaN NaN
## CountryNetherlands 5.000e+01 NaN NaN NaN
## CountryNew Zealand 5.110e+01 NaN NaN NaN
## CountryNicaragua 2.530e+01 NaN NaN NaN
## CountryNiger 2.540e+01 NaN NaN NaN
## CountryNigeria 2.490e+01 NaN NaN NaN
## CountryNorway 4.740e+01 NaN NaN NaN
## CountryOman 2.710e+01 NaN NaN NaN
## CountryPakistan 1.930e+01 NaN NaN NaN
## CountryPanama 3.590e+01 NaN NaN NaN
## CountryPapua New Guinea 2.510e+01 NaN NaN NaN
## CountryParaguay 3.340e+01 NaN NaN NaN
## CountryPeru 3.700e+01 NaN NaN NaN
## CountryPhilippines 3.160e+01 NaN NaN NaN
## CountryPoland 3.920e+01 NaN NaN NaN
## CountryPortugal 4.130e+01 NaN NaN NaN
## CountryQatar 3.820e+01 NaN NaN NaN
## CountryRomania 3.760e+01 NaN NaN NaN
## CountryRussia 2.660e+01 NaN NaN NaN
## CountryRwanda 2.760e+01 NaN NaN NaN
## CountrySaint Lucia 3.480e+01 NaN NaN NaN
## CountrySaint Vincent and the Grenadines 3.620e+01 NaN NaN NaN
## CountrySamoa 3.880e+01 NaN NaN NaN
## CountrySaudi Arabia 2.600e+01 NaN NaN NaN
## CountrySenegal 3.050e+01 NaN NaN NaN
## CountrySerbia 3.570e+01 NaN NaN NaN
## CountrySeychelles 3.160e+01 NaN NaN NaN
## CountrySierra Leone 2.250e+01 NaN NaN NaN
## CountrySingapore 5.490e+01 NaN NaN NaN
## CountrySlovakia 4.020e+01 NaN NaN NaN
## CountrySlovenia 4.100e+01 NaN NaN NaN
## CountrySolomon Islands 2.700e+01 NaN NaN NaN
## CountrySomalia 3.150e+01 NaN NaN NaN
## CountrySouth Africa 2.670e+01 NaN NaN NaN
## CountrySpain 3.870e+01 NaN NaN NaN
## CountrySri Lanka 2.380e+01 NaN NaN NaN
## CountrySudan 2.500e+00 NaN NaN NaN
## CountrySuriname 1.860e+01 NaN NaN NaN
## CountrySweden 4.840e+01 NaN NaN NaN
## CountrySwitzerland 5.470e+01 NaN NaN NaN
## CountrySyria 2.460e+01 NaN NaN NaN
## CountryTajikistan 2.020e+01 NaN NaN NaN
## CountryTanzania 3.000e+01 NaN NaN NaN
## CountryThailand 3.370e+01 NaN NaN NaN
## CountryTimor-Leste 1.680e+01 NaN NaN NaN
## CountryTogo 2.770e+01 NaN NaN NaN
## CountryTonga 3.130e+01 NaN NaN NaN
## CountryTrinidad and Tobago 2.930e+01 NaN NaN NaN
## CountryTunisia 2.470e+01 NaN NaN NaN
## CountryTurkey 2.740e+01 NaN NaN NaN
## CountryTurkmenistan 1.670e+01 NaN NaN NaN
## CountryUganda 2.470e+01 NaN NaN NaN
## CountryUkraine 2.460e+01 NaN NaN NaN
## CountryUnited Arab Emirates 4.070e+01 NaN NaN NaN
## CountryUnited Kingdom 4.320e+01 NaN NaN NaN
## CountryUnited States 4.260e+01 NaN NaN NaN
## CountryUruguay 4.050e+01 NaN NaN NaN
## CountryUzbekistan 2.620e+01 NaN NaN NaN
## CountryVanuatu 3.340e+01 NaN NaN NaN
## CountryVenezuela -4.700e+00 NaN NaN NaN
## CountryVietnam 3.110e+01 NaN NaN NaN
## CountryYemen 1.860e+01 NaN NaN NaN
## CountryZambia 1.920e+01 NaN NaN NaN
## CountryZimbabwe 3.600e+00 NaN NaN NaN
## GNI_Capita_PPP NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 178 and 0 DF, p-value: NA
resid %>% as.data.frame() %>% ggplot(aes(x=resid)) + geom_boxplot(fill="lightblue")
# The majority of the residuals are between (-20, 20), which is fine. However, there are a few values outside that interval (outliers). To reduce noise we can remove that values or discretize the variable.
Another outliers against other variables are:
idx = outlier(resid, logical=T)
data$Country[which(idx==TRUE)] # Again, North Korea.
## [1] "Korea, North "
which(resid < -20 | resid >20) # We have 6 countries with outlier residuals.
## 1 41 86 88 152 176 180
## 1 41 86 88 152 175 179
# We can see that there are 3-6 countries that are outliers for any
# combination of economic_freedom with any other variable:
lm.fit1 = lm(Economic_freedom ~Unemployment, data)
resid1 = residuals(lm.fit1)
qplot(data$Unemployment, resid1)
summary(lm(Economic_freedom ~ Country + Unemployment, data))
##
## Call:
## lm(formula = Economic_freedom ~ Country + Unemployment, data = data)
##
## Residuals:
## ALL 179 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.950e+01 NaN NaN NaN
## CountryAlbania 3.710e+01 NaN NaN NaN
## CountryAlgeria 1.630e+01 NaN NaN NaN
## CountryAngola 2.310e+01 NaN NaN NaN
## CountryArgentina 2.060e+01 NaN NaN NaN
## CountryArmenia 3.580e+01 NaN NaN NaN
## CountryAustralia 4.820e+01 NaN NaN NaN
## CountryAustria 4.430e+01 NaN NaN NaN
## CountryAzerbaijan 3.210e+01 NaN NaN NaN
## CountryBahamas 3.920e+01 NaN NaN NaN
## CountryBahrain 3.250e+01 NaN NaN NaN
## CountryBangladesh 2.320e+01 NaN NaN NaN
## CountryBarbados 4.180e+01 NaN NaN NaN
## CountryBelarus 2.350e+01 NaN NaN NaN
## CountryBelgium 4.010e+01 NaN NaN NaN
## CountryBelize 2.710e+01 NaN NaN NaN
## CountryBenin 3.150e+01 NaN NaN NaN
## CountryBhutan 2.980e+01 NaN NaN NaN
## CountryBolivia 1.350e+01 NaN NaN NaN
## CountryBosnia and Herzegovina 3.390e+01 NaN NaN NaN
## CountryBotswana 3.530e+01 NaN NaN NaN
## CountryBrazil 2.380e+01 NaN NaN NaN
## CountryBrunei Darussalam 3.530e+01 NaN NaN NaN
## CountryBulgaria 4.150e+01 NaN NaN NaN
## CountryBurkina Faso 2.880e+01 NaN NaN NaN
## CountryBurundi 9.900e+00 NaN NaN NaN
## CountryCabo Verde 3.720e+01 NaN NaN NaN
## CountryCambodia 2.760e+01 NaN NaN NaN
## CountryCameroon 2.340e+01 NaN NaN NaN
## CountryCanada 4.710e+01 NaN NaN NaN
## CountryCentral African Republic 1.620e+01 NaN NaN NaN
## CountryChad 2.030e+01 NaN NaN NaN
## CountryChile 4.490e+01 NaN NaN NaN
## CountryChina 1.850e+01 NaN NaN NaN
## CountryColombia 3.560e+01 NaN NaN NaN
## CountryComoros 2.090e+01 NaN NaN NaN
## CountryCongo, Dem. Rep. 1.810e+01 NaN NaN NaN
## CountryCongo, Rep. 1.900e+01 NaN NaN NaN
## CountryCosta Rica 3.590e+01 NaN NaN NaN
## CountryCroatia 3.810e+01 NaN NaN NaN
## CountryCuba 4.724e-13 NaN NaN NaN
## CountryCyprus 4.340e+01 NaN NaN NaN
## CountryCzech Republic 4.490e+01 NaN NaN NaN
## CountryDenmark 4.850e+01 NaN NaN NaN
## CountryDjibouti 2.580e+01 NaN NaN NaN
## CountryDominica 2.490e+01 NaN NaN NaN
## CountryDominican Republic 3.350e+01 NaN NaN NaN
## CountryEcuador 2.480e+01 NaN NaN NaN
## CountryEgypt 1.960e+01 NaN NaN NaN
## CountryEl Salvador 3.010e+01 NaN NaN NaN
## CountryEquatorial Guinea 1.770e+01 NaN NaN NaN
## CountryEritrea 1.020e+01 NaN NaN NaN
## CountryEstonia 5.050e+01 NaN NaN NaN
## CountryEswatini 2.190e+01 NaN NaN NaN
## CountryEthiopia 2.010e+01 NaN NaN NaN
## CountryFiji 2.690e+01 NaN NaN NaN
## CountryFinland 4.880e+01 NaN NaN NaN
## CountryFrance 3.640e+01 NaN NaN NaN
## CountryGabon 2.630e+01 NaN NaN NaN
## CountryGambia 2.850e+01 NaN NaN NaN
## CountryGeorgia 4.230e+01 NaN NaN NaN
## CountryGermany 4.660e+01 NaN NaN NaN
## CountryGhana 3.030e+01 NaN NaN NaN
## CountryGreece 3.200e+01 NaN NaN NaN
## CountryGuatemala 3.370e+01 NaN NaN NaN
## CountryGuinea 2.470e+01 NaN NaN NaN
## CountryGuinea-Bissau 1.650e+01 NaN NaN NaN
## CountryGuyana 3.000e+01 NaN NaN NaN
## CountryHaiti 2.050e+01 NaN NaN NaN
## CountryHonduras 3.000e+01 NaN NaN NaN
## CountryHungary 3.740e+01 NaN NaN NaN
## CountryIceland 4.750e+01 NaN NaN NaN
## CountryIndia 2.440e+01 NaN NaN NaN
## CountryIndonesia 3.490e+01 NaN NaN NaN
## CountryIran 1.290e+01 NaN NaN NaN
## CountryIraq 2.340e+01 NaN NaN NaN
## CountryIreland 5.250e+01 NaN NaN NaN
## CountryIsrael 3.850e+01 NaN NaN NaN
## CountryItaly 3.590e+01 NaN NaN NaN
## CountryJamaica 3.790e+01 NaN NaN NaN
## CountryJapan 4.040e+01 NaN NaN NaN
## CountryJordan 3.060e+01 NaN NaN NaN
## CountryKazakhstan 3.490e+01 NaN NaN NaN
## CountryKenya 2.310e+01 NaN NaN NaN
## CountryKiribati 2.970e+01 NaN NaN NaN
## CountryKorea, North -2.650e+01 NaN NaN NaN
## CountryKorea, South 4.510e+01 NaN NaN NaN
## CountryKuwait 2.880e+01 NaN NaN NaN
## CountryKyrgyzstan 2.630e+01 NaN NaN NaN
## CountryLao P.D.R. 1.970e+01 NaN NaN NaN
## CountryLatvia 4.530e+01 NaN NaN NaN
## CountryLebanon 1.780e+01 NaN NaN NaN
## CountryLesotho 1.860e+01 NaN NaN NaN
## CountryLiberia 1.840e+01 NaN NaN NaN
## CountryLibya 4.230e+01 NaN NaN NaN
## CountryLithuania 4.630e+01 NaN NaN NaN
## CountryLuxembourg 5.110e+01 NaN NaN NaN
## CountryMacedonia 3.620e+01 NaN NaN NaN
## CountryMadagascar 2.940e+01 NaN NaN NaN
## CountryMalawi 2.350e+01 NaN NaN NaN
## CountryMalaysia 3.860e+01 NaN NaN NaN
## CountryMaldives 1.780e+01 NaN NaN NaN
## CountryMali 2.640e+01 NaN NaN NaN
## CountryMalta 4.200e+01 NaN NaN NaN
## CountryMauritania 2.580e+01 NaN NaN NaN
## CountryMauritius 4.140e+01 NaN NaN NaN
## CountryMexico 3.420e+01 NaN NaN NaN
## CountryMicronesia 3.150e+01 NaN NaN NaN
## CountryMoldova 3.180e+01 NaN NaN NaN
## CountryMongolia 3.440e+01 NaN NaN NaN
## CountryMontenegro 2.830e+01 NaN NaN NaN
## CountryMorocco 2.970e+01 NaN NaN NaN
## CountryMozambique 2.180e+01 NaN NaN NaN
## CountryMyanmar 2.010e+01 NaN NaN NaN
## CountryNamibia 2.970e+01 NaN NaN NaN
## CountryNepal 2.020e+01 NaN NaN NaN
## CountryNetherlands 5.000e+01 NaN NaN NaN
## CountryNew Zealand 5.110e+01 NaN NaN NaN
## CountryNicaragua 2.530e+01 NaN NaN NaN
## CountryNiger 2.540e+01 NaN NaN NaN
## CountryNigeria 2.490e+01 NaN NaN NaN
## CountryNorway 4.740e+01 NaN NaN NaN
## CountryOman 2.710e+01 NaN NaN NaN
## CountryPakistan 1.930e+01 NaN NaN NaN
## CountryPanama 3.590e+01 NaN NaN NaN
## CountryPapua New Guinea 2.510e+01 NaN NaN NaN
## CountryParaguay 3.340e+01 NaN NaN NaN
## CountryPeru 3.700e+01 NaN NaN NaN
## CountryPhilippines 3.160e+01 NaN NaN NaN
## CountryPoland 3.920e+01 NaN NaN NaN
## CountryPortugal 4.130e+01 NaN NaN NaN
## CountryQatar 3.820e+01 NaN NaN NaN
## CountryRomania 3.760e+01 NaN NaN NaN
## CountryRussia 2.660e+01 NaN NaN NaN
## CountryRwanda 2.760e+01 NaN NaN NaN
## CountrySaint Lucia 3.480e+01 NaN NaN NaN
## CountrySaint Vincent and the Grenadines 3.620e+01 NaN NaN NaN
## CountrySamoa 3.880e+01 NaN NaN NaN
## CountrySaudi Arabia 2.600e+01 NaN NaN NaN
## CountrySenegal 3.050e+01 NaN NaN NaN
## CountrySerbia 3.570e+01 NaN NaN NaN
## CountrySeychelles 3.160e+01 NaN NaN NaN
## CountrySierra Leone 2.250e+01 NaN NaN NaN
## CountrySingapore 5.490e+01 NaN NaN NaN
## CountrySlovakia 4.020e+01 NaN NaN NaN
## CountrySlovenia 4.100e+01 NaN NaN NaN
## CountrySolomon Islands 2.700e+01 NaN NaN NaN
## CountrySomalia 3.150e+01 NaN NaN NaN
## CountrySouth Africa 2.670e+01 NaN NaN NaN
## CountrySpain 3.870e+01 NaN NaN NaN
## CountrySri Lanka 2.380e+01 NaN NaN NaN
## CountrySudan 2.500e+00 NaN NaN NaN
## CountrySuriname 1.860e+01 NaN NaN NaN
## CountrySweden 4.840e+01 NaN NaN NaN
## CountrySwitzerland 5.470e+01 NaN NaN NaN
## CountrySyria 2.460e+01 NaN NaN NaN
## CountryTajikistan 2.020e+01 NaN NaN NaN
## CountryTanzania 3.000e+01 NaN NaN NaN
## CountryThailand 3.370e+01 NaN NaN NaN
## CountryTimor-Leste 1.680e+01 NaN NaN NaN
## CountryTogo 2.770e+01 NaN NaN NaN
## CountryTonga 3.130e+01 NaN NaN NaN
## CountryTrinidad and Tobago 2.930e+01 NaN NaN NaN
## CountryTunisia 2.470e+01 NaN NaN NaN
## CountryTurkey 2.740e+01 NaN NaN NaN
## CountryTurkmenistan 1.670e+01 NaN NaN NaN
## CountryUganda 2.470e+01 NaN NaN NaN
## CountryUkraine 2.460e+01 NaN NaN NaN
## CountryUnited Arab Emirates 4.070e+01 NaN NaN NaN
## CountryUnited Kingdom 4.320e+01 NaN NaN NaN
## CountryUnited States 4.260e+01 NaN NaN NaN
## CountryUruguay 4.050e+01 NaN NaN NaN
## CountryUzbekistan 2.620e+01 NaN NaN NaN
## CountryVanuatu 3.340e+01 NaN NaN NaN
## CountryVenezuela -4.700e+00 NaN NaN NaN
## CountryVietnam 3.110e+01 NaN NaN NaN
## CountryYemen 1.860e+01 NaN NaN NaN
## CountryZambia 1.920e+01 NaN NaN NaN
## CountryZimbabwe 3.600e+00 NaN NaN NaN
## Unemployment NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 178 and 0 DF, p-value: NA
resid1 %>% as.data.frame() %>% ggplot(aes(x=resid1)) + geom_boxplot(fill="lightblue")
which(resid1 < -30 | resid1 >30)
## 1 41 86 176
## 1 41 86 175
lm.fit2 = lm(Economic_freedom ~ Life_Expentancy, data)
resid2 = residuals(lm.fit2)
qplot(data$Life_Expentancy, resid2)
summary(lm(Economic_freedom ~ Country + Life_Expentancy, data))
##
## Call:
## lm(formula = Economic_freedom ~ Country + Life_Expentancy, data = data)
##
## Residuals:
## ALL 179 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.950e+01 NaN NaN NaN
## CountryAlbania 3.710e+01 NaN NaN NaN
## CountryAlgeria 1.630e+01 NaN NaN NaN
## CountryAngola 2.310e+01 NaN NaN NaN
## CountryArgentina 2.060e+01 NaN NaN NaN
## CountryArmenia 3.580e+01 NaN NaN NaN
## CountryAustralia 4.820e+01 NaN NaN NaN
## CountryAustria 4.430e+01 NaN NaN NaN
## CountryAzerbaijan 3.210e+01 NaN NaN NaN
## CountryBahamas 3.920e+01 NaN NaN NaN
## CountryBahrain 3.250e+01 NaN NaN NaN
## CountryBangladesh 2.320e+01 NaN NaN NaN
## CountryBarbados 4.180e+01 NaN NaN NaN
## CountryBelarus 2.350e+01 NaN NaN NaN
## CountryBelgium 4.010e+01 NaN NaN NaN
## CountryBelize 2.710e+01 NaN NaN NaN
## CountryBenin 3.150e+01 NaN NaN NaN
## CountryBhutan 2.980e+01 NaN NaN NaN
## CountryBolivia 1.350e+01 NaN NaN NaN
## CountryBosnia and Herzegovina 3.390e+01 NaN NaN NaN
## CountryBotswana 3.530e+01 NaN NaN NaN
## CountryBrazil 2.380e+01 NaN NaN NaN
## CountryBrunei Darussalam 3.530e+01 NaN NaN NaN
## CountryBulgaria 4.150e+01 NaN NaN NaN
## CountryBurkina Faso 2.880e+01 NaN NaN NaN
## CountryBurundi 9.900e+00 NaN NaN NaN
## CountryCabo Verde 3.720e+01 NaN NaN NaN
## CountryCambodia 2.760e+01 NaN NaN NaN
## CountryCameroon 2.340e+01 NaN NaN NaN
## CountryCanada 4.710e+01 NaN NaN NaN
## CountryCentral African Republic 1.620e+01 NaN NaN NaN
## CountryChad 2.030e+01 NaN NaN NaN
## CountryChile 4.490e+01 NaN NaN NaN
## CountryChina 1.850e+01 NaN NaN NaN
## CountryColombia 3.560e+01 NaN NaN NaN
## CountryComoros 2.090e+01 NaN NaN NaN
## CountryCongo, Dem. Rep. 1.810e+01 NaN NaN NaN
## CountryCongo, Rep. 1.900e+01 NaN NaN NaN
## CountryCosta Rica 3.590e+01 NaN NaN NaN
## CountryCroatia 3.810e+01 NaN NaN NaN
## CountryCuba 4.724e-13 NaN NaN NaN
## CountryCyprus 4.340e+01 NaN NaN NaN
## CountryCzech Republic 4.490e+01 NaN NaN NaN
## CountryDenmark 4.850e+01 NaN NaN NaN
## CountryDjibouti 2.580e+01 NaN NaN NaN
## CountryDominica 2.490e+01 NaN NaN NaN
## CountryDominican Republic 3.350e+01 NaN NaN NaN
## CountryEcuador 2.480e+01 NaN NaN NaN
## CountryEgypt 1.960e+01 NaN NaN NaN
## CountryEl Salvador 3.010e+01 NaN NaN NaN
## CountryEquatorial Guinea 1.770e+01 NaN NaN NaN
## CountryEritrea 1.020e+01 NaN NaN NaN
## CountryEstonia 5.050e+01 NaN NaN NaN
## CountryEswatini 2.190e+01 NaN NaN NaN
## CountryEthiopia 2.010e+01 NaN NaN NaN
## CountryFiji 2.690e+01 NaN NaN NaN
## CountryFinland 4.880e+01 NaN NaN NaN
## CountryFrance 3.640e+01 NaN NaN NaN
## CountryGabon 2.630e+01 NaN NaN NaN
## CountryGambia 2.850e+01 NaN NaN NaN
## CountryGeorgia 4.230e+01 NaN NaN NaN
## CountryGermany 4.660e+01 NaN NaN NaN
## CountryGhana 3.030e+01 NaN NaN NaN
## CountryGreece 3.200e+01 NaN NaN NaN
## CountryGuatemala 3.370e+01 NaN NaN NaN
## CountryGuinea 2.470e+01 NaN NaN NaN
## CountryGuinea-Bissau 1.650e+01 NaN NaN NaN
## CountryGuyana 3.000e+01 NaN NaN NaN
## CountryHaiti 2.050e+01 NaN NaN NaN
## CountryHonduras 3.000e+01 NaN NaN NaN
## CountryHungary 3.740e+01 NaN NaN NaN
## CountryIceland 4.750e+01 NaN NaN NaN
## CountryIndia 2.440e+01 NaN NaN NaN
## CountryIndonesia 3.490e+01 NaN NaN NaN
## CountryIran 1.290e+01 NaN NaN NaN
## CountryIraq 2.340e+01 NaN NaN NaN
## CountryIreland 5.250e+01 NaN NaN NaN
## CountryIsrael 3.850e+01 NaN NaN NaN
## CountryItaly 3.590e+01 NaN NaN NaN
## CountryJamaica 3.790e+01 NaN NaN NaN
## CountryJapan 4.040e+01 NaN NaN NaN
## CountryJordan 3.060e+01 NaN NaN NaN
## CountryKazakhstan 3.490e+01 NaN NaN NaN
## CountryKenya 2.310e+01 NaN NaN NaN
## CountryKiribati 2.970e+01 NaN NaN NaN
## CountryKorea, North -2.650e+01 NaN NaN NaN
## CountryKorea, South 4.510e+01 NaN NaN NaN
## CountryKuwait 2.880e+01 NaN NaN NaN
## CountryKyrgyzstan 2.630e+01 NaN NaN NaN
## CountryLao P.D.R. 1.970e+01 NaN NaN NaN
## CountryLatvia 4.530e+01 NaN NaN NaN
## CountryLebanon 1.780e+01 NaN NaN NaN
## CountryLesotho 1.860e+01 NaN NaN NaN
## CountryLiberia 1.840e+01 NaN NaN NaN
## CountryLibya 4.230e+01 NaN NaN NaN
## CountryLithuania 4.630e+01 NaN NaN NaN
## CountryLuxembourg 5.110e+01 NaN NaN NaN
## CountryMacedonia 3.620e+01 NaN NaN NaN
## CountryMadagascar 2.940e+01 NaN NaN NaN
## CountryMalawi 2.350e+01 NaN NaN NaN
## CountryMalaysia 3.860e+01 NaN NaN NaN
## CountryMaldives 1.780e+01 NaN NaN NaN
## CountryMali 2.640e+01 NaN NaN NaN
## CountryMalta 4.200e+01 NaN NaN NaN
## CountryMauritania 2.580e+01 NaN NaN NaN
## CountryMauritius 4.140e+01 NaN NaN NaN
## CountryMexico 3.420e+01 NaN NaN NaN
## CountryMicronesia 3.150e+01 NaN NaN NaN
## CountryMoldova 3.180e+01 NaN NaN NaN
## CountryMongolia 3.440e+01 NaN NaN NaN
## CountryMontenegro 2.830e+01 NaN NaN NaN
## CountryMorocco 2.970e+01 NaN NaN NaN
## CountryMozambique 2.180e+01 NaN NaN NaN
## CountryMyanmar 2.010e+01 NaN NaN NaN
## CountryNamibia 2.970e+01 NaN NaN NaN
## CountryNepal 2.020e+01 NaN NaN NaN
## CountryNetherlands 5.000e+01 NaN NaN NaN
## CountryNew Zealand 5.110e+01 NaN NaN NaN
## CountryNicaragua 2.530e+01 NaN NaN NaN
## CountryNiger 2.540e+01 NaN NaN NaN
## CountryNigeria 2.490e+01 NaN NaN NaN
## CountryNorway 4.740e+01 NaN NaN NaN
## CountryOman 2.710e+01 NaN NaN NaN
## CountryPakistan 1.930e+01 NaN NaN NaN
## CountryPanama 3.590e+01 NaN NaN NaN
## CountryPapua New Guinea 2.510e+01 NaN NaN NaN
## CountryParaguay 3.340e+01 NaN NaN NaN
## CountryPeru 3.700e+01 NaN NaN NaN
## CountryPhilippines 3.160e+01 NaN NaN NaN
## CountryPoland 3.920e+01 NaN NaN NaN
## CountryPortugal 4.130e+01 NaN NaN NaN
## CountryQatar 3.820e+01 NaN NaN NaN
## CountryRomania 3.760e+01 NaN NaN NaN
## CountryRussia 2.660e+01 NaN NaN NaN
## CountryRwanda 2.760e+01 NaN NaN NaN
## CountrySaint Lucia 3.480e+01 NaN NaN NaN
## CountrySaint Vincent and the Grenadines 3.620e+01 NaN NaN NaN
## CountrySamoa 3.880e+01 NaN NaN NaN
## CountrySaudi Arabia 2.600e+01 NaN NaN NaN
## CountrySenegal 3.050e+01 NaN NaN NaN
## CountrySerbia 3.570e+01 NaN NaN NaN
## CountrySeychelles 3.160e+01 NaN NaN NaN
## CountrySierra Leone 2.250e+01 NaN NaN NaN
## CountrySingapore 5.490e+01 NaN NaN NaN
## CountrySlovakia 4.020e+01 NaN NaN NaN
## CountrySlovenia 4.100e+01 NaN NaN NaN
## CountrySolomon Islands 2.700e+01 NaN NaN NaN
## CountrySomalia 3.150e+01 NaN NaN NaN
## CountrySouth Africa 2.670e+01 NaN NaN NaN
## CountrySpain 3.870e+01 NaN NaN NaN
## CountrySri Lanka 2.380e+01 NaN NaN NaN
## CountrySudan 2.500e+00 NaN NaN NaN
## CountrySuriname 1.860e+01 NaN NaN NaN
## CountrySweden 4.840e+01 NaN NaN NaN
## CountrySwitzerland 5.470e+01 NaN NaN NaN
## CountrySyria 2.460e+01 NaN NaN NaN
## CountryTajikistan 2.020e+01 NaN NaN NaN
## CountryTanzania 3.000e+01 NaN NaN NaN
## CountryThailand 3.370e+01 NaN NaN NaN
## CountryTimor-Leste 1.680e+01 NaN NaN NaN
## CountryTogo 2.770e+01 NaN NaN NaN
## CountryTonga 3.130e+01 NaN NaN NaN
## CountryTrinidad and Tobago 2.930e+01 NaN NaN NaN
## CountryTunisia 2.470e+01 NaN NaN NaN
## CountryTurkey 2.740e+01 NaN NaN NaN
## CountryTurkmenistan 1.670e+01 NaN NaN NaN
## CountryUganda 2.470e+01 NaN NaN NaN
## CountryUkraine 2.460e+01 NaN NaN NaN
## CountryUnited Arab Emirates 4.070e+01 NaN NaN NaN
## CountryUnited Kingdom 4.320e+01 NaN NaN NaN
## CountryUnited States 4.260e+01 NaN NaN NaN
## CountryUruguay 4.050e+01 NaN NaN NaN
## CountryUzbekistan 2.620e+01 NaN NaN NaN
## CountryVanuatu 3.340e+01 NaN NaN NaN
## CountryVenezuela -4.700e+00 NaN NaN NaN
## CountryVietnam 3.110e+01 NaN NaN NaN
## CountryYemen 1.860e+01 NaN NaN NaN
## CountryZambia 1.920e+01 NaN NaN NaN
## CountryZimbabwe 3.600e+00 NaN NaN NaN
## Life_Expentancy NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 178 and 0 DF, p-value: NA
resid2 %>% as.data.frame() %>% ggplot(aes(x=resid2)) + geom_boxplot(fill="lightblue")
which(resid2 < -19 | resid2 >19)
## 41 75 86 152 176
## 41 75 86 152 175
We have seen many ways to identify the outliers in this variable. In general, the outliers that we get with the different methods are the same. We could remove the extreme values to reduce noise, but as I don’t want to remove more rows of our dataset, I believe that the best decision is to discretize the variable to reduce noise.
Discretize the variable
# I will discretize them in the same the way that the website
# where I got the data from does it: https://www.heritage.org/index/ranking
# 5 levels:
data$Economic_freedom[data$Economic_freedom >= 80] <- 'Free'
data$Economic_freedom[data$Economic_freedom <= 79.9 &
data$Economic_freedom >= 70] <- 'Mostly Free'
data$Economic_freedom[data$Economic_freedom <= 69.9 &
data$Economic_freedom >= 60] <- 'Moderately Free'
data$Economic_freedom[data$Economic_freedom <= 59.9 &
data$Economic_freedom >= 50] <- 'Mostly Unfree'
data$Economic_freedom[data$Economic_freedom <= 49.9] <- 'Repressed'
# The counts are:
table(data$Economic_freedom)
##
## Free Moderately Free Mostly Free Mostly Unfree Repressed
## 6 52 28 59 34
prop.table(table(data$Economic_freedom))
##
## Free Moderately Free Mostly Free Mostly Unfree Repressed
## 0.03351955 0.29050279 0.15642458 0.32960894 0.18994413
# Let's reorder the levels:
data$Economic_freedom = factor(data$Economic_freedom, levels =
c('Free', 'Mostly Free', 'Moderately Free',
'Mostly Unfree', 'Repressed'))
# Let's make some graphs to understand better the relations between variables:
# GNI per capita vs Life Expectancy grouping countries by economic freedom level:
ggplot(data, aes(y=GNI_Capita_PPP, x=Life_Expentancy,
group=Economic_freedom, color=Unemployment))+
scale_x_sqrt(breaks=c(0.05,0.1), label=c("5%","10%"))+
geom_point(alpha=0.5) +
geom_smooth(method=lm,se=F, formula = y~x) +
facet_wrap(~ Economic_freedom) +
scale_color_gradient(low="green", high="red") +
theme_gray() +
labs(title = "GNI per capita PPP vs Life Expentancy",
caption="Alvaro Martin",
x = "", y = "")
# More developed countries (that we have seen that are the more free) emit more CO2 emissions:
ggplot(data)+aes(y=CO2, fill = Economic_freedom)+geom_boxplot()+
facet_wrap(~Economic_freedom)+ theme(legend.position = 'none')
# Los paises mas desarrollados son responsables de la mayor mparte de las emisiones
# In poor ccuntries, women have more kids:
ggplot(data)+aes(x=GDP_capita_PPP, y = Fertility)+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# The number of cell phones and internet users is related.
ggplot(data)+aes(x = Internet_users, y = Cell_phones)+geom_quantile()+geom_point()
## Smoothing formula not specified. Using: y ~ x
# Let's check it by calculating the correlation
cor(data$Cell_phones, data$Internet_users, method = c("pearson", "kendall", "spearman"))
## [1] 0.7966409
# Another couple of variables that are related:
ggscatter(data, x = "Adult_Mortality", y = "Infant_Mortality",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "pearson",
xlab = "Adult_Mortality", ylab = "Infant_Mortality")
## `geom_smooth()` using formula 'y ~ x'
# Democracy by region
ggplot(data)+aes(y=Democracy, fill = Region)+geom_bar()+facet_wrap(~Region)
# Europe and America are the continents with more advanced democracies
# How many countries we have from each region?
ggplot(data, aes(x="", y="", fill=Region)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0)
# What is the mean education equality rate for each group of countries in terms of economic freedom
value <- c(mean(data$Education_Equality[which(data$Economic_freedom=='Free')]),
mean(data$Education_Equality[which(data$Economic_freedom=='Mostly Free')]),
mean(data$Education_Equality[which(data$Economic_freedom=='Moderately Free')]),
mean(data$Education_Equality[which(data$Economic_freedom=='Mostly Unfree')]),
mean(data$Education_Equality[which(data$Economic_freedom=='Repressed')]))
group <- c("Free", "Mostly-Free", "Moderately-Free", "Mostly-Unfree", "Repressed")
df <- data.frame(group,value)
ggplot(df, aes(group, value)) + geom_linerange(aes(x = group, ymin = 0.8, ymax = value),
color = "lightgray", size = 1.5)+ geom_point(aes(color = group), size = 3)+
ggpubr::color_palette("jco")+theme_pubclean()+ theme(legend.position = 'none')
# Freer countries are more egalitarians.
# Business freedom by region
ggplot(data,(aes(x=Region, y = Business_Freedom, fill = Region)))+geom_violin(scale = "area")+
geom_dotplot(binaxis='y', stackdir='center', dotsize=0.5)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
# Infant moratlity is extremely high in sub-saharian countries:
ggplot(data, aes(x=reorder(Region, Under5_mortality), y = Under5_mortality, fill = Region))+geom_boxplot()
# property rights vs judical effectiveness by region:
ggplot(data, aes(x=Judical_Effectiveness, y=Property_Rights) ) +
stat_density_2d(aes(fill = ..level..), geom = "polygon")+
theme(legend.position = 'none')+facet_wrap(~Region)
# property rights and judical effectiveness are related:
# In this density chart, a lighter blue indicates higher density
# With the density chart we can see both the correlation between variables and where there is a bigger concentration of values. For instance, we can see in Europe most of the values are in the top right part of the charge (high Judical Effectiveness and Property Rights)
# PCA:
# To use PCA, we need that all the variables are numeric, so:
Economic_Freedom <- data_imp$Economic_freedom
data_num = cbind(data[,2:14], data[,16:19], Economic_Freedom)
# From dimension 15 to dimension 2
boxplot(data_num, las=2, col="darkblue")
# scale or not to scale?
boxplot(scale(data_num), las=2, col="darkblue")
# En nuestro caso yo creo q está claro que necesitamos SCALE
# With the following command we can see the correlation between all the variables
ggcorr(data_num, label = T)
# We can check some that some relations between varibales that we
# previously study through graphs were correct.
# (Adult moratlity and Infant Moratlity, Property Rights and Jusical Effectivenes, etc.)
# Now, with PCA we are going to reduce from dimension 20 to dim 2:
pca = prcomp(data_num, scale=T) # Notice that we have scaled the data
# pca = princomp(nba, cor=T) # the same, but using SVD instead of eigen decomposition
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.1514 1.4017 1.24061 1.01963 0.99543 0.81557 0.64431
## Proportion of Variance 0.5517 0.1092 0.08551 0.05776 0.05505 0.03695 0.02306
## Cumulative Proportion 0.5517 0.6609 0.74639 0.80415 0.85920 0.89615 0.91922
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.5401 0.52120 0.44410 0.41664 0.37850 0.35141 0.31566
## Proportion of Variance 0.0162 0.01509 0.01096 0.00964 0.00796 0.00686 0.00554
## Cumulative Proportion 0.9354 0.95051 0.96147 0.97111 0.97907 0.98593 0.99147
## PC15 PC16 PC17 PC18
## Standard deviation 0.27072 0.26127 0.08250 0.07234
## Proportion of Variance 0.00407 0.00379 0.00038 0.00029
## Cumulative Proportion 0.99554 0.99933 0.99971 1.00000
# ANother way to see the correlation matrix:
R = cor(data_num) # correlation matrix
eigen(R)
## eigen() decomposition
## $values
## [1] 9.931137710 1.964831729 1.539125347 1.039642132 0.990873170 0.665147710
## [7] 0.415129210 0.291655141 0.271653579 0.197225728 0.173592002 0.143265069
## [13] 0.123486262 0.099640990 0.073291301 0.068263840 0.006805486 0.005233594
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.257670000 -0.28386644 0.132511618 0.060474260 0.02113734
## [2,] -0.261019809 -0.10674269 0.317686994 -0.036159733 0.17791478
## [3,] 0.255273417 -0.21190446 0.009050281 -0.136595989 0.28799217
## [4,] 0.273545877 -0.28667851 0.121323767 -0.015365822 0.15796410
## [5,] -0.286217833 0.22102565 -0.052976365 0.083638045 -0.18223484
## [6,] 0.264642470 -0.32262191 0.143422126 0.009209958 0.17020140
## [7,] -0.170392860 0.05201517 0.526339343 -0.264099981 -0.02539185
## [8,] -0.277167904 -0.03758093 0.140230445 -0.024794451 0.21853109
## [9,] 0.039842021 0.40710942 0.198372789 0.428136361 0.56855485
## [10,] -0.280835392 -0.14423675 -0.116035068 -0.048055691 -0.13181052
## [11,] -0.268324610 -0.14583993 0.147972773 0.094300558 0.18159978
## [12,] -0.136913211 -0.25254999 -0.391516363 0.440356711 0.23317683
## [13,] -0.253865311 -0.25518921 -0.188442001 0.053668496 0.21816029
## [14,] -0.273005474 -0.24795105 -0.057229899 0.003271300 0.09367161
## [15,] -0.007836825 0.10979334 -0.331813876 -0.701768262 0.45036621
## [16,] -0.265032602 -0.10122506 0.343577890 -0.069749948 0.06315639
## [17,] -0.156584284 0.30345203 -0.196863902 -0.065425276 0.19412563
## [18,] -0.250339292 -0.33362261 -0.113824569 -0.075928318 -0.14345286
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.025418577 0.029546915 -0.523509471 0.02698708 -0.16574794
## [2,] -0.045546537 -0.047491845 -0.042712867 0.39827824 -0.30385001
## [3,] 0.282480628 -0.048894507 0.466020174 -0.11138144 0.20705128
## [4,] 0.072399929 -0.036781257 -0.067011632 0.05641391 -0.12243970
## [5,] -0.184566415 0.030420899 -0.234926103 0.09054251 -0.04643238
## [6,] 0.054451631 -0.028820763 -0.074446717 0.05499467 -0.11406407
## [7,] 0.167690046 0.530603641 -0.162936424 -0.39872988 0.20045182
## [8,] -0.067452839 0.146619907 0.224911409 0.05243396 0.23841552
## [9,] -0.164932845 -0.181724690 0.006781735 -0.40545438 -0.19322089
## [10,] 0.014867197 -0.178655996 0.318382476 -0.22748244 -0.33171337
## [11,] -0.122091114 -0.196586692 0.084182071 0.37326185 0.44672525
## [12,] -0.005194540 0.669325617 0.075022652 0.08446032 -0.12742421
## [13,] 0.009801686 -0.170702717 -0.392510688 -0.15669527 0.33620321
## [14,] 0.088416410 -0.303660057 -0.178277632 -0.27088157 0.04064692
## [15,] -0.344926767 0.104148625 -0.098424976 0.02565909 -0.14467749
## [16,] 0.055020548 -0.003606066 0.126722028 0.15973214 -0.39696847
## [17,] 0.817559902 -0.046389489 -0.190047891 0.18831822 -0.10600375
## [18,] 0.043199676 -0.048873038 0.052363733 -0.36046412 -0.20626694
## [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 0.01047177 -0.37744098 0.548178989 0.20768532 -0.18756116 -0.06273718
## [2,] -0.19599888 -0.26859869 -0.176814095 -0.03071274 0.55176453 -0.28388114
## [3,] -0.21225652 -0.40376265 0.026883022 -0.09214541 -0.04669883 -0.06662766
## [4,] 0.13243664 0.41809289 -0.173661497 0.11144936 0.11562853 0.12875187
## [5,] 0.02206691 0.03827488 0.003833234 0.01562716 -0.01743323 0.02104828
## [6,] 0.15927375 0.38151866 -0.109604595 0.07912200 0.09518255 0.08704394
## [7,] -0.21092237 0.10643343 -0.065002767 0.15670463 0.01618362 -0.02249547
## [8,] 0.77435484 -0.10079168 0.277429706 0.08394239 0.14980646 0.00619606
## [9,] -0.07991078 0.05657374 0.110903930 -0.02934633 0.03371643 0.02863997
## [10,] -0.10579883 -0.02716694 0.068856739 0.72073570 0.04314080 0.18256212
## [11,] -0.38810861 0.35333907 0.339431751 0.10930211 -0.19748002 -0.01344081
## [12,] -0.11009151 0.02301842 -0.075818152 0.02817308 -0.10272989 -0.09348576
## [13,] -0.03921756 -0.25764932 -0.280599234 -0.03487717 0.16625816 0.53421326
## [14,] 0.17030944 0.04687785 -0.284226001 0.02607905 -0.32977903 -0.65060087
## [15,] -0.05979428 0.07734440 0.055652677 0.01935091 -0.08657902 -0.02637891
## [16,] 0.07262592 -0.10209993 -0.129929638 -0.28935751 -0.58375662 0.36421183
## [17,] 0.03631152 0.11933305 0.171867281 0.05019239 0.02742122 0.02542031
## [18,] -0.08873815 0.22475321 0.443952943 -0.52471398 0.27348331 0.01296648
## [,17] [,18]
## [1,] -0.046488149 -0.009987675
## [2,] -0.002678718 0.040946536
## [3,] -0.036322639 -0.456492433
## [4,] -0.700471668 -0.135250566
## [5,] -0.043652817 -0.849009982
## [6,] 0.707238947 -0.211806259
## [7,] 0.006002004 -0.007558793
## [8,] -0.034345938 -0.006984421
## [9,] -0.006794095 -0.004089904
## [10,] 0.023526060 -0.003893175
## [11,] -0.010762926 0.038807822
## [12,] -0.010029600 0.017158024
## [13,] 0.015796409 0.049106822
## [14,] -0.019381040 -0.003233845
## [15,] 0.011018207 0.005824470
## [16,] -0.006146502 0.025423657
## [17,] 0.016820195 0.002259013
## [18,] -0.025696349 -0.028638351
# How many components?
fviz_screeplot(pca, addlabels = TRUE)
# With one component we can explain 55% of the variance of our data, with
# 2 more than 60% and with 3 components almost 75%.
# First component:
barplot(pca$rotation[,1], las=2, col="darkblue")
# Re-escribir esto q es un copia pega:
# Squared loadings are easier to interpret than the loadings
# I.e. they are like percentages (numbers between 0 and 1)
# So let's plot squared loadings instead
# They are called contribution of variables to components
# So let's plot squared loadings instead
# They are called contribution of variables to component
fviz_contrib(pca, choice = "var", axes = 1)
# Re-escribir:
# The red dashed line on the graph above indicates the expected average contribution
# Now we can rank the countries by their first PC scores:
names = data[,1]
# The best
names[order(pca$x[,1])][(length(names)-10):length(names)]
## [1] "Iceland" "Denmark" "United States" "Singapore"
## [5] "Sweden" "Australia" "Ireland" "Netherlands"
## [9] "Switzerland" "Norway" "Luxembourg"
# Another way (doesn't give us the exact same result, but it is very similar)
# to see the best 10 countries according to the first principal component:
calculateScore = function(data) {
return(sum((pca$rotation[, 1]*data)^2))
}
data$Country[sort.int(apply(data_num, 1, calculateScore), decreasing = T, index.return = T)$ix[1:10]]
## [1] "Luxembourg" "Singapore" "Ireland"
## [4] "Qatar" "Switzerland" "Norway"
## [7] "Brunei Darussalam" "United States" "Denmark"
## [10] "Netherlands"
# The worst
names[order(pca$x[,1])][1:10]
## [1] "Afghanistan" "Sierra Leone"
## [3] "Chad" "Congo, Dem. Rep."
## [5] "Niger" "Central African Republic"
## [7] "Guinea-Bissau" "Liberia"
## [9] "Burundi" "Angola"
# Now, let's compute the second component
barplot(pca$rotation[,2], las=2, col="lawngreen")
# Contribution of variables to second component
fviz_contrib(pca, choice = "var", axes = 2)
# Now we can rank the countries by their second PC scores:
names[order(pca$x[,2])][1:10] # Countries with high infant mortality
## [1] "Sierra Leone" "Niger" "Mali" "Somalia"
## [5] "Burkina Faso" "Guinea-Bissau" "Chad" "Angola"
## [9] "Benin" "Liberia"
names[order(pca$x[,2])][(length(names)-10):length(names)]
## [1] "Dominica" "Uzbekistan" "China" "Algeria"
## [5] "Syria" "Belarus" "Turkmenistan" "Iran"
## [9] "Cuba" "Venezuela" "Korea, North "
Firs look:
# To use PCA, we need that all the variables are numeric, so:
Economic_Freedom <- data_imp$Economic_freedom
data_num = cbind(data[,2:14], data[,16:19], Economic_Freedom)
# We need to sacale:
boxplot(data_num, las=2, col="darkblue")
boxplot(scale(data_num), las=2, col="darkblue")
# With the following command we can see the correlation between all the variables
ggcorr(data_num, label = T)
# We can check some that some relations between variables that we
# previously study through graphs were correct.
# (Adult moratlity and Infant Moratlity, Property Rights and Jusical Effectivenes, etc.)
# Another way to see the correlation matrix:
R = cor(data_num) # correlation matrix
eigen(R)
## eigen() decomposition
## $values
## [1] 9.931137710 1.964831729 1.539125347 1.039642132 0.990873170 0.665147710
## [7] 0.415129210 0.291655141 0.271653579 0.197225728 0.173592002 0.143265069
## [13] 0.123486262 0.099640990 0.073291301 0.068263840 0.006805486 0.005233594
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.257670000 -0.28386644 0.132511618 0.060474260 0.02113734
## [2,] -0.261019809 -0.10674269 0.317686994 -0.036159733 0.17791478
## [3,] 0.255273417 -0.21190446 0.009050281 -0.136595989 0.28799217
## [4,] 0.273545877 -0.28667851 0.121323767 -0.015365822 0.15796410
## [5,] -0.286217833 0.22102565 -0.052976365 0.083638045 -0.18223484
## [6,] 0.264642470 -0.32262191 0.143422126 0.009209958 0.17020140
## [7,] -0.170392860 0.05201517 0.526339343 -0.264099981 -0.02539185
## [8,] -0.277167904 -0.03758093 0.140230445 -0.024794451 0.21853109
## [9,] 0.039842021 0.40710942 0.198372789 0.428136361 0.56855485
## [10,] -0.280835392 -0.14423675 -0.116035068 -0.048055691 -0.13181052
## [11,] -0.268324610 -0.14583993 0.147972773 0.094300558 0.18159978
## [12,] -0.136913211 -0.25254999 -0.391516363 0.440356711 0.23317683
## [13,] -0.253865311 -0.25518921 -0.188442001 0.053668496 0.21816029
## [14,] -0.273005474 -0.24795105 -0.057229899 0.003271300 0.09367161
## [15,] -0.007836825 0.10979334 -0.331813876 -0.701768262 0.45036621
## [16,] -0.265032602 -0.10122506 0.343577890 -0.069749948 0.06315639
## [17,] -0.156584284 0.30345203 -0.196863902 -0.065425276 0.19412563
## [18,] -0.250339292 -0.33362261 -0.113824569 -0.075928318 -0.14345286
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.025418577 0.029546915 -0.523509471 0.02698708 -0.16574794
## [2,] -0.045546537 -0.047491845 -0.042712867 0.39827824 -0.30385001
## [3,] 0.282480628 -0.048894507 0.466020174 -0.11138144 0.20705128
## [4,] 0.072399929 -0.036781257 -0.067011632 0.05641391 -0.12243970
## [5,] -0.184566415 0.030420899 -0.234926103 0.09054251 -0.04643238
## [6,] 0.054451631 -0.028820763 -0.074446717 0.05499467 -0.11406407
## [7,] 0.167690046 0.530603641 -0.162936424 -0.39872988 0.20045182
## [8,] -0.067452839 0.146619907 0.224911409 0.05243396 0.23841552
## [9,] -0.164932845 -0.181724690 0.006781735 -0.40545438 -0.19322089
## [10,] 0.014867197 -0.178655996 0.318382476 -0.22748244 -0.33171337
## [11,] -0.122091114 -0.196586692 0.084182071 0.37326185 0.44672525
## [12,] -0.005194540 0.669325617 0.075022652 0.08446032 -0.12742421
## [13,] 0.009801686 -0.170702717 -0.392510688 -0.15669527 0.33620321
## [14,] 0.088416410 -0.303660057 -0.178277632 -0.27088157 0.04064692
## [15,] -0.344926767 0.104148625 -0.098424976 0.02565909 -0.14467749
## [16,] 0.055020548 -0.003606066 0.126722028 0.15973214 -0.39696847
## [17,] 0.817559902 -0.046389489 -0.190047891 0.18831822 -0.10600375
## [18,] 0.043199676 -0.048873038 0.052363733 -0.36046412 -0.20626694
## [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 0.01047177 -0.37744098 0.548178989 0.20768532 -0.18756116 -0.06273718
## [2,] -0.19599888 -0.26859869 -0.176814095 -0.03071274 0.55176453 -0.28388114
## [3,] -0.21225652 -0.40376265 0.026883022 -0.09214541 -0.04669883 -0.06662766
## [4,] 0.13243664 0.41809289 -0.173661497 0.11144936 0.11562853 0.12875187
## [5,] 0.02206691 0.03827488 0.003833234 0.01562716 -0.01743323 0.02104828
## [6,] 0.15927375 0.38151866 -0.109604595 0.07912200 0.09518255 0.08704394
## [7,] -0.21092237 0.10643343 -0.065002767 0.15670463 0.01618362 -0.02249547
## [8,] 0.77435484 -0.10079168 0.277429706 0.08394239 0.14980646 0.00619606
## [9,] -0.07991078 0.05657374 0.110903930 -0.02934633 0.03371643 0.02863997
## [10,] -0.10579883 -0.02716694 0.068856739 0.72073570 0.04314080 0.18256212
## [11,] -0.38810861 0.35333907 0.339431751 0.10930211 -0.19748002 -0.01344081
## [12,] -0.11009151 0.02301842 -0.075818152 0.02817308 -0.10272989 -0.09348576
## [13,] -0.03921756 -0.25764932 -0.280599234 -0.03487717 0.16625816 0.53421326
## [14,] 0.17030944 0.04687785 -0.284226001 0.02607905 -0.32977903 -0.65060087
## [15,] -0.05979428 0.07734440 0.055652677 0.01935091 -0.08657902 -0.02637891
## [16,] 0.07262592 -0.10209993 -0.129929638 -0.28935751 -0.58375662 0.36421183
## [17,] 0.03631152 0.11933305 0.171867281 0.05019239 0.02742122 0.02542031
## [18,] -0.08873815 0.22475321 0.443952943 -0.52471398 0.27348331 0.01296648
## [,17] [,18]
## [1,] -0.046488149 -0.009987675
## [2,] -0.002678718 0.040946536
## [3,] -0.036322639 -0.456492433
## [4,] -0.700471668 -0.135250566
## [5,] -0.043652817 -0.849009982
## [6,] 0.707238947 -0.211806259
## [7,] 0.006002004 -0.007558793
## [8,] -0.034345938 -0.006984421
## [9,] -0.006794095 -0.004089904
## [10,] 0.023526060 -0.003893175
## [11,] -0.010762926 0.038807822
## [12,] -0.010029600 0.017158024
## [13,] 0.015796409 0.049106822
## [14,] -0.019381040 -0.003233845
## [15,] 0.011018207 0.005824470
## [16,] -0.006146502 0.025423657
## [17,] 0.016820195 0.002259013
## [18,] -0.025696349 -0.028638351
Creation of the principal components:
# Now, with PCA we are going to reduce from dimension 20 to dim 2:
pca = prcomp(data_num, scale=T)
# Notice that we have scaled the data
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.1514 1.4017 1.24061 1.01963 0.99543 0.81557 0.64431
## Proportion of Variance 0.5517 0.1092 0.08551 0.05776 0.05505 0.03695 0.02306
## Cumulative Proportion 0.5517 0.6609 0.74639 0.80415 0.85920 0.89615 0.91922
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.5401 0.52120 0.44410 0.41664 0.37850 0.35141 0.31566
## Proportion of Variance 0.0162 0.01509 0.01096 0.00964 0.00796 0.00686 0.00554
## Cumulative Proportion 0.9354 0.95051 0.96147 0.97111 0.97907 0.98593 0.99147
## PC15 PC16 PC17 PC18
## Standard deviation 0.27072 0.26127 0.08250 0.07234
## Proportion of Variance 0.00407 0.00379 0.00038 0.00029
## Cumulative Proportion 0.99554 0.99933 0.99971 1.00000
How many components?
fviz_screeplot(pca, addlabels = TRUE)
# With one component we can explain 55% of the variance of our data.
# With 2, more than 60%.
# First component:
# What we see is the contribution of variables to components.
barplot(pca$rotation[,1], las=2, col="darkblue")
# We can also do the square loadings plot, which is easier to understand.
# So let's plot squared loadings instead
# They are called contribution of variables to component
fviz_contrib(pca, choice = "var", axes = 1)
# The first component gives more importance to the variables Life Expectancy, Cell phones, Business Freedom... to classify the countries.
# With this, we can guess that it is going to classify the countries for their quality of life level.
# Now we can rank the countries by their first PC scores:
names = data[,1]
# The best
names[order(pca$x[,1])][(length(names)-10):length(names)]
## [1] "Iceland" "Denmark" "United States" "Singapore"
## [5] "Sweden" "Australia" "Ireland" "Netherlands"
## [9] "Switzerland" "Norway" "Luxembourg"
# The worst
names[order(pca$x[,1])][1:10]
## [1] "Afghanistan" "Sierra Leone"
## [3] "Chad" "Congo, Dem. Rep."
## [5] "Niger" "Central African Republic"
## [7] "Guinea-Bissau" "Liberia"
## [9] "Burundi" "Angola"
Now, the second component:
# Now, let's compute the second component:
barplot(pca$rotation[,2], las=2, col="lawngreen")
# Contribution of variables to second component
fviz_contrib(pca, choice = "var", axes = 2)
# In this case, business freedom and cell phones, which were very important for the first component, are among the ones with less importance for the second comopnent.
# Now we can rank the countries by their second PC scores:
names[order(pca$x[,2])][1:10] # Countries with high infant mortality
## [1] "Sierra Leone" "Niger" "Mali" "Somalia"
## [5] "Burkina Faso" "Guinea-Bissau" "Chad" "Angola"
## [9] "Benin" "Liberia"
names[order(pca$x[,2])][(length(names)-10):length(names)]
## [1] "Dominica" "Uzbekistan" "China" "Algeria"
## [5] "Syria" "Belarus" "Turkmenistan" "Iran"
## [9] "Cuba" "Venezuela" "Korea, North "
# Once we have interpreted the meaning of the first two components, let's see the contribution of each country to components
head(get_pca_ind(pca)$contrib[,1]) # this is in %
## 1 2 3 4 5 6
## 1.981209531 0.041269022 0.044932225 1.593738153 0.023609652 0.001467308
head((pca$x[,1]^2)/(pca$sdev[1]^2))/dim(data_num)[1] # between 0 and 1
## 1 2 3 4 5 6
## 1.981210e-02 4.126902e-04 4.493222e-04 1.593738e-02 2.360965e-04 1.467308e-05
# Countries contributions to first component:
# The top 50 contributions
fviz_contrib(pca, choice = "ind", axes = 1, top=50)
# All contributions
fviz_contrib(pca, choice = "ind", axes = 1)
# The top 10 countries that contribute to the first component
names[order(get_pca_ind(pca)$contrib[,1],decreasing=T)][1:10]
## [1] "Luxembourg" "Afghanistan"
## [3] "Sierra Leone" "Norway"
## [5] "Chad" "Congo, Dem. Rep."
## [7] "Niger" "Central African Republic"
## [9] "Guinea-Bissau" "Switzerland"
# Finally, let's make a zoom to see the top-30 countries in contributions.
# Also, let's plot it in a graph to see it clearer
names_z1 = names[order(get_pca_ind(pca)$contrib[,1],decreasing=T)]
fviz_contrib(pca, choice = "ind", axes = 1, top=30)+
scale_x_discrete(labels=names_z1)
Biplot
# observations and variables in same graph (using first 2 components as axes)
biplot(pca)
# variables around the center of the graph doesn't contribute much
# to any of the to PC, whereas variables in the corners are the most significant to both principal components
# Let's remove countries from the graph so we can see clearer the contribution of each variable
fviz_pca_var(pca, col.var = "contrib")
# Another different look:
fviz_pca_biplot(pca, repel = TRUE)
## Warning: ggrepel: 129 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Scores:
data.frame(z1=-pca$x[,1],z2=pca$x[,2]) %>%
ggplot(aes(z1,z2,label=names)) + geom_point(size=0) +
labs(title="PCA", x="PC1", y="PC2") +
theme_bw() + scale_color_gradient(low="grey", high="black")+
theme(legend.position="bottom") + geom_text(size=2, hjust=0.6, vjust=0, check_overlap = TRUE)
# The first two PCs are always uncorrelated.
# We can change the color to see the relation for different variables:
data.frame(z1=-pca$x[,1],z2=pca$x[,2]) %>%
ggplot(aes(z1,z2,label=names,color=data_num$Life_Expentancy)) + geom_point(size=0) +
labs(title="PCA", x="PC1", y="PC2") +
theme_bw() + scale_color_gradient(low="lightblue", high="darkblue")+
theme(legend.position="bottom") + geom_text(size=2, hjust=0.6, vjust=0, check_overlap = TRUE)
# We can see that, the first component is highly correlated with Life Expentacy.
# PC1 is also kind of related with GNI per capita PPP:
data.frame(z1=-pca$x[,1],z2=pca$x[,2]) %>%
ggplot(aes(z1,z2,label=names,color=data_num$GNI_Capita_PPP)) + geom_point(size=0) +
labs(title="PCA", x="PC1", y="PC2") +
theme_bw() + scale_color_gradient(low="lightblue", high="darkblue")+
theme(legend.position="bottom") + geom_text(size=2, hjust=0.6, vjust=0, check_overlap = TRUE)
# Relation between PC1 and number of cell phones:
data.frame(z1=-pca$x[,1],z2=pca$x[,2]) %>%
ggplot(aes(z1,z2,label=names,color=data_num$Cell_phones)) + geom_point(size=0) +
labs(title="PCA", x="PC1", y="PC2") +
theme_bw() + scale_color_gradient(low="grey", high="black")+
theme(legend.position="bottom") + geom_text(size=2, hjust=0.6, vjust=0, check_overlap = TRUE)
# Which are the regions with the better countries to live?
region = data[,20]
data.frame(z1=-pca$x[,1],region=region) %>%
group_by(region) %>% summarise(mean=mean(z1)) %>% arrange(desc(mean))
# Europe is the region with the better countries to live of the world (overall),
# followed by the American continent. According to this, the worst
# region to live is the Sub-Saharan Africa.
# I think it broadly coincides with the perception we all have.
data.f <- factanal(data_num, factors = 3, rotation="none", scores="regression", lower = 0.01)
data.f
##
## Call:
## factanal(x = data_num, factors = 3, scores = "regression", rotation = "none", lower = 0.01)
##
## Uniquenesses:
## Fertility GNI_Capita_PPP Adult_Mortality
## 0.219 0.309 0.010
## Infant_Mortality Life_Expentancy Under5_mortality
## 0.010 0.010 0.010
## CO2 Cell_phones Inflation
## 0.762 0.268 0.908
## Business_Freedom Internet_users Democracy
## 0.195 0.244 0.763
## Judical_Effectiveness Property_Rights Unemployment
## 0.229 0.122 0.924
## GDP_capita_PPP Education_Equality Economic_Freedom
## 0.307 0.637 0.260
##
## Loadings:
## Factor1 Factor2 Factor3
## Fertility -0.853 0.216
## GNI_Capita_PPP 0.620 0.523 0.184
## Adult_Mortality -0.928 0.108 -0.343
## Infant_Mortality -0.972 0.213
## Life_Expentancy 0.990 0.107
## Under5_mortality -0.965 0.239
## CO2 0.425 0.223
## Cell_phones 0.706 0.480
## Inflation -0.293
## Business_Freedom 0.743 0.503
## Internet_users 0.647 0.560 0.152
## Democracy 0.299 0.380
## Judical_Effectiveness 0.589 0.647
## Property_Rights 0.633 0.679 0.127
## Unemployment -0.274
## GDP_capita_PPP 0.634 0.513 0.165
## Education_Equality 0.505 -0.324
## Economic_Freedom 0.574 0.630 0.119
##
## Factor1 Factor2 Factor3
## SS loadings 8.307 2.912 0.598
## Proportion Var 0.462 0.162 0.033
## Cumulative Var 0.462 0.623 0.657
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 738.49 on 102 degrees of freedom.
## The p-value is 3.84e-97
cbind(data.f$loadings, data.f$uniquenesses)
## Factor1 Factor2 Factor3
## Fertility -0.853186556 -0.07757631 0.21615584 0.2193291
## GNI_Capita_PPP 0.619970107 0.52268354 0.18375539 0.3086757
## Adult_Mortality -0.928104127 0.10767357 -0.34276064 0.0100000
## Infant_Mortality -0.972186499 0.00201423 0.21278436 0.0100000
## Life_Expentancy 0.990251065 0.01323967 0.10662610 0.0100000
## Under5_mortality -0.965139112 0.05283313 0.23901820 0.0100000
## CO2 0.425124223 0.22335847 0.08864518 0.7615293
## Cell_phones 0.706239620 0.47952894 0.05550602 0.2682003
## Inflation -0.009932774 -0.29261929 -0.07776372 0.9082518
## Business_Freedom 0.742999288 0.50250693 0.01002447 0.1953351
## Internet_users 0.647495676 0.55987547 0.15171784 0.2442742
## Democracy 0.298903870 0.38038885 0.04998289 0.7634613
## Judical_Effectiveness 0.588927910 0.64650782 0.07754008 0.2291771
## Property_Rights 0.633346407 0.67904498 0.12693440 0.1216577
## Unemployment 0.030682211 0.02282905 -0.27387600 0.9235206
## GDP_capita_PPP 0.634374042 0.51335112 0.16545675 0.3066681
## Education_Equality 0.505307469 0.05129792 -0.32361272 0.6373353
## Economic_Freedom 0.574093553 0.62953589 0.11928630 0.2598670
# var explained by first three factors is around 66%
par(mfrow=c(3,1)) # This is to view the three graphs at the same time
barplot(data.f$loadings[,1], names=F, las=2, col="darkblue", ylim = c(-1, 1))
barplot(data.f$loadings[,2], names=F, las=2, col="darkblue", ylim = c(-1, 1))
barplot(data.f$loadings[,3], las=2, col="darkblue", ylim = c(-1, 1))
With two factors it looks as follows
data.f2 <- factanal(data_num, factors = 2, rotation="varimax", scores="Bartlett", lower = 0.01)
data.f2
##
## Call:
## factanal(x = data_num, factors = 2, scores = "Bartlett", rotation = "varimax", lower = 0.01)
##
## Uniquenesses:
## Fertility GNI_Capita_PPP Adult_Mortality
## 0.225 0.302 0.282
## Infant_Mortality Life_Expentancy Under5_mortality
## 0.010 0.085 0.010
## CO2 Cell_phones Inflation
## 0.754 0.262 0.912
## Business_Freedom Internet_users Democracy
## 0.200 0.242 0.778
## Judical_Effectiveness Property_Rights Unemployment
## 0.258 0.135 0.984
## GDP_capita_PPP Education_Equality Economic_Freedom
## 0.294 0.686 0.264
##
## Loadings:
## Factor1 Factor2
## Fertility -0.389 -0.789
## GNI_Capita_PPP 0.784 0.290
## Adult_Mortality -0.433 -0.728
## Infant_Mortality -0.378 -0.920
## Life_Expentancy 0.491 0.821
## Under5_mortality -0.324 -0.941
## CO2 0.424 0.257
## Cell_phones 0.747 0.425
## Inflation -0.271 0.119
## Business_Freedom 0.762 0.468
## Internet_users 0.812 0.313
## Democracy 0.457 0.112
## Judical_Effectiveness 0.821 0.262
## Property_Rights 0.889 0.272
## Unemployment 0.113
## GDP_capita_PPP 0.780 0.312
## Education_Equality 0.171 0.534
## Economic_Freedom 0.824 0.240
##
## Factor1 Factor2
## SS loadings 6.489 4.829
## Proportion Var 0.361 0.268
## Cumulative Var 0.361 0.629
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 1143.42 on 118 degrees of freedom.
## The p-value is 2e-167
cbind(data.f2$loadings, data.f2$uniquenesses)
## Factor1 Factor2
## Fertility -0.3892152 -0.7893461 0.22544824
## GNI_Capita_PPP 0.7838405 0.2896214 0.30171420
## Adult_Mortality -0.4333226 -0.7279830 0.28225480
## Infant_Mortality -0.3784391 -0.9204879 0.01000000
## Life_Expentancy 0.4914202 0.8205697 0.08517211
## Under5_mortality -0.3240280 -0.9413097 0.01000000
## CO2 0.4244565 0.2569622 0.75377230
## Cell_phones 0.7465727 0.4253317 0.26172656
## Inflation -0.2713886 0.1188333 0.91221839
## Business_Freedom 0.7623408 0.4680862 0.19972464
## Internet_users 0.8124896 0.3132018 0.24176451
## Democracy 0.4572141 0.1120195 0.77838616
## Judical_Effectiveness 0.8206817 0.2622013 0.25773601
## Property_Rights 0.8892529 0.2719540 0.13526989
## Unemployment -0.0572678 0.1133762 0.98375625
## GDP_capita_PPP 0.7803381 0.3116981 0.29391309
## Education_Equality 0.1706461 0.5335385 0.68621291
## Economic_Freedom 0.8237478 0.2397151 0.26397122
par(mfrow=c(2,1))
barplot(data.f2$loadings[,1], names=F, las=2, col="darkblue", ylim = c(-1, 1))
barplot(data.f2$loadings[,2], las=2, col="darkblue", ylim = c(-1, 1))
# The first two factors can be interpreted, from my point of view, as follows:
# one with more importance to health variables (life expentancy, mortality...),
# and the second one with more weights to economic variables (PIB per capita, economic freedom...)
# The third factor is a bit more difficult to decribe.
# Distribution of the score of the countries with each factor:
factor.df1 = data.frame(Country=data$Country, data.f2$scores) %>% gather("factor", "score", -Country)
factor.df1 %>%
ggplot(aes(x=Country,y=score)) + geom_point(size=1) +
theme_bw() + theme(legend.position="bottom") + scale_color_brewer(palette="Dark2") +
facet_wrap(~factor, ncol =1) +
labs(title="2-factor model", x="", y="scores", col="")
# The grpah where we are going to plot the countries is the same
# that we did before with the two principal components (the axes are the PC)
data.frame(z1=-pca$x[,1],z2=pca$x[,2]) %>%
ggplot(aes(z1,z2,label=names)) + geom_point(size=0) +
labs(title="PCA", x="PC1", y="PC2") +
theme_bw() +theme(legend.position="bottom") +
geom_text(size=2, hjust=0.6, vjust=0, check_overlap = TRUE)
# Scale the data:
X = scale(data_num)
How many centers?
# Based on wss (total within sum of square)
fviz_nbclust(X, kmeans, method = 'wss')
# Here, based on the "elbow method" we could guess that the optimum
# number of centers is 3.
# Based on shiloutte
fviz_nbclust(X, kmeans, method = 'silhouette')
# Give us that 2 and 3 are the two best number of centers
# Based on the gap statistic (using bootstrap)
fviz_nbclust(X, kmeans, method = 'gap_stat', k.max = 20)
# According to the gap statistic, we should select 3 centers
After executing the three methods I beleive that the optimum number of centers is 3.
##Kmeans
fit = kmeans(X, centers=3, nstart=100)
groups = fit$cluster
groups
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 1 1 2 1 1 3 3 1 3 3 1 3 1 3 1 2 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 3 3 2 2 1 1 2 3 2 2 3 1 1 2 2 2 1 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 3 3 3 2 1 1 1 1 1 2 2 3 2 2 1 3 3 1 2
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 3 2 3 1 2 2 1 1 1 3 3 1 1 1 2 3 3 3 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 3 1 1 2 1 1 3 3 1 2 3 1 2 2 1 3 3 1 2 2
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 3 1 2 3 2 3 1 1 1 1 1 1 2 2 1 2 3 3 1 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 3 1 2 1 1 1 1 1 3 3 3 3 1 2 1 1 1 1 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 158 159 160 161
## 1 1 2 3 3 3 1 2 1 3 1 2 1 3 3 1 1 2 1 2
## 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 2 1 1 1 1 1 2 1 3 3 3 1 1 1 1 1 2 2 2
# Are the groups well balanced?
barplot(table(groups), col="blue")
# Groups 2 and 3 have a similar dimension, 1 it is the one that is bigger. But I would say that it is more or less a normal distribution of all our data. It is not that we have a lot of countries in one group and the others are almost empty. In this case, the three groups have a reasonable number of countries
# Variables for each center:
centers=fit$centers
centers
## Fertility GNI_Capita_PPP Adult_Mortality Infant_Mortality Life_Expentancy
## 1 -0.2720572 -0.3750661 -0.1898215 -0.2619509 0.1770181
## 2 1.3777453 -0.7361939 1.2674339 1.3886070 -1.3708876
## 3 -0.8270098 1.3340912 -0.8644276 -0.8545316 0.9834960
## Under5_mortality CO2 Cell_phones Inflation Business_Freedom
## 1 -0.3183422 -0.2055099 -0.2203137 0.10703621 -0.08075569
## 2 1.4053888 -0.5682594 -0.9786780 -0.05056037 -1.03517861
## 3 -0.7736151 0.8857707 1.2964397 -0.13602580 1.11023865
## Internet_users Democracy Judical_Effectiveness Property_Rights Unemployment
## 1 -0.3688983 -0.04221718 -0.2031830 -0.2694080 0.2758798
## 2 -0.7684335 -0.55341482 -0.8147002 -0.8476192 -0.2340903
## 3 1.3537836 0.59190459 1.1131343 1.2575664 -0.2531785
## GDP_capita_PPP Education_Equality Economic_Freedom
## 1 -0.3326723 0.3481310 -0.2152980
## 2 -0.7796751 -1.0085497 -0.7667696
## 3 1.3022352 0.3500057 1.0889068
# Who are the countries in the first group?
i=1 # plotting the centers in cluster 1
bar1=barplot(centers[i,], las=2, col="darkblue", ylim=c(-2,2),
main=paste("Cluster", i,": Group center in blue, global center in red"))
points(bar1,y=apply(X, 2, quantile, 0.50),col="red",pch=19)
# Third-word countries
# Second group
i=2 # plotting the centers in cluster 2
bar2=barplot(centers[i,], las=2, col="darkblue", ylim=c(-2,2),
main=paste("Cluster", i,": Group center in blue, global center in red"))
points(bar2,y=apply(X, 2, quantile, 0.50),col="red",pch=19)
# We could guess that they are Developing countries
# Third group
i=3 # plotting the centers in cluster 3
bar3=barplot(centers[i,], las=2, col="darkblue", ylim=c(-2,2),
main=paste("Cluster", i,": Group center in blue, global center in red"))
points(bar2,y=apply(X, 2, quantile, 0.50),col="red",pch=19)
# Countries with less fertility than the mean, higher GDP and Life expectancy... It seems this are richer countries (firs-world countries)
# I have notice that when running the code several times, the number of the group changes. I always get the same clusters, but sometimes the richer countries are in group 3 and sometimes in 1. I say this beacuse if the coments does not coincide with the graphs, it is because it has changed the number of the group. Nevertheless, the clusters I get is always the same, so the conclusions are equally valid, the only thing that changes is the number of the group.
Clusplot
fviz_cluster(fit, data = X, geom = c("point"),ellipse.type = 'norm', pointsize=1)+
theme_minimal()+geom_text(label=names,hjust=0, vjust=0,size=2,check_overlap = T)+scale_fill_brewer(palette="Paired")
# After watching the plot, we can confirm what we guess with the barcharts.
# Broadly, group 3 includes first-word countries (high-developed countries)
# such as Australia, Singapore, Netherlands... In the first group
# we can find south-american countries (Argentina, Brazil),
# the poorer countries in Europe (Montenegro, Bosnia), and some north-african
# countries (Algeria, Egypt, Cameroon)
# Finally, in the second group we have the sub-saharian countries mainly
Silhouette plot
# The silhouette value in [-1,1] measures the similarity (cohesion) of a data point to its cluster relative to other clusters (separation).
# Silhouette plots rely on a distance metric and suggest that the data matches its own cluster well.
# The larger the silhouette widths, the better.
d <- dist(X, method="euclidean")
sil = silhouette(groups, d)
plot(sil, col=1:5, main="", border=NA)
summary(sil)
## Silhouette of 179 units in 3 clusters from silhouette.default(x = groups, dist = d) :
## Cluster sizes and average silhouette widths:
## 84 46 49
## 0.2760713 0.2787378 0.3992127
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.02774 0.20151 0.32765 0.31047 0.42400 0.58512
# Our average silhouette width is 0.31, which is pretty good.
Profile variables
# From the dataset with all of our data let's get some variables that we exclude from the clustering and let's see if we can draw any other conclusions:
age = total.data$Population.median.age..years.
age = age[-which(vec>6)] # remember that in the preprocessing we removed
# a row which has many missing values. We have to do this again so our
# vector has the appropiate length.
summary(age) # although there are some NAs, the graph automatically is not going to plot the missing values
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.00 20.00 25.00 26.85 35.00 43.00 3
as.data.frame(X) %>% mutate(cluster=factor(groups), names=names, Age=age) %>%
ggplot(aes(x = cluster, y = Age)) +
geom_boxplot(fill="darkblue") +
labs(title = "Age by cluster", x = "", y = "", col = "")
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
# Conclusion: The median age is higher the more developed the country is.
# This is something generally know, in poorer countries the birth rate
# is higher and life expectancy lower, resulting in lower median ages.
Antoher interesnting graphs
urban_population = total.data$Urban_population_pct_of_total; urban_population = urban_population[-which(vec>6)]
total_population = total.data$Population..Millions. ; total_population = total_population[-which(vec>6)]
as.data.frame(X) %>% mutate(cluster=factor(groups), names=names, Population=total_population) %>%
ggplot(aes(x = cluster, y = Population)) +
geom_boxplot(fill="darkblue") + scale_y_continuous(trans='log10')+
labs(title = "Minutes played by cluster", x = "", y = "", col = "")
# Population is not relevant to determine the quality of life of a country.
# We may think that bigger economies may have more power or influence to
# impose their interest globlally and take advantage, but we have seen
# that in fact the size of a country/economy is not important. In fact,
# some countries that have been leading our research are really small (Luxembourg, Switzerland...)
as.data.frame(X) %>% mutate(cluster=factor(groups), names=names, Urban_pop=urban_population) %>%
ggplot(aes(x = cluster, y = Urban_pop)) +
geom_boxplot(fill="darkblue") +
labs(title = "Minutes played by cluster", x = "", y = "", col = "")
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
# In higher-developped countries people live more in cities, there is more urban population
# kmeans with MAHALANOBIS distance
S_x <- cov(data_num)
iS <- solve(S_x)
e <- eigen(iS)
V <- e$vectors
B <- V %*% diag(sqrt(e$values)) %*% t(V)
Xtil <- scale(data_num,scale = FALSE)
data_num.S <- Xtil %*% B
fit.mahalanobis = kmeans(data_num.S, centers=3, nstart=100)
groups = fit.mahalanobis$cluster
centers=fit.mahalanobis$centers
colnames(centers)=colnames(X)
centers
## Fertility GNI_Capita_PPP Adult_Mortality Infant_Mortality Life_Expentancy
## 1 -0.03367330 -0.3549929 0.011067101 -0.02552415 -0.06443750
## 2 0.14386577 1.5077412 -0.007011833 0.09847801 0.28161306
## 3 0.05069208 0.6817621 -0.681136027 0.21285153 -0.00711553
## Under5_mortality CO2 Cell_phones Inflation Business_Freedom
## 1 -0.04268396 0.01325338 0.03224284 -0.10531093 0.008814855
## 2 0.17825911 -0.06082781 -0.15289425 -0.09580018 -0.037558307
## 3 0.13197002 0.04941535 0.20127063 9.16308997 -0.014957481
## Internet_users Democracy Judical_Effectiveness Property_Rights
## 1 -0.03563722 -0.012433847 0.01360478 -0.02515754
## 2 0.16859189 0.054423234 -0.05418373 0.11723333
## 3 -0.21588603 -0.002746367 -0.08551246 -0.12300683
## Unemployment GDP_capita_PPP Education_Equality Economic_Freedom
## 1 0.02270471 -0.2358220 0.002056248 0.007289119
## 2 -0.10327644 1.0876858 -0.016393959 -0.039286241
## 3 0.06932188 -0.9676286 0.122450446 0.123406376
i=1 # plotting the centers in cluster 1
bar1=barplot(centers[i,], las=2, col="darkblue", ylim=c(-2,2), main=paste("Cluster", i,": Group center in blue, global center in red"))
points(bar1,y=apply(X, 2, quantile, 0.50),col="red",pch=19)
# It stands out that Democracy, which was a variable not very used until now, it is relevant for this factor.
i=2 # plotting the centers in cluster 2
bar2=barplot(centers[i,], las=2, col="darkblue", ylim=c(-2,2), main=paste("Cluster", i,": Group center in blue, global center in red"))
points(bar2,y=apply(X, 2, quantile, 0.50),col="red",pch=19)
# A bit high infant mortality, low democracy and economic freedom score.
i=3 # plotting the centers in cluster 3
bar3=barplot(centers[i,], las=2, col="darkblue", ylim=c(-2,2), main=paste("Cluster", i,": Group center in blue, global center in red"))
points(bar3,y=apply(X, 2, quantile, 0.50),col="red",pch=19)
# This third group is very strange. Specially beacause inflation it is nota variable that many countries have high.
# Let's check how many countries are there in group 3
barplot(table(groups), col="blue")
# There is only 1 country.
# Cluspot
fviz_cluster(fit.mahalanobis, data = X, geom = c("point"),ellipse.type = 'norm', pointsize=1)+
theme_minimal()+geom_text(label=names,hjust=0, vjust=0,size=2,check_overlap = T)+scale_fill_brewer(palette="Paired")
## Too few points to calculate an ellipse
# The only country in group 3 is Venezuela (which is a country with high inflation as we saw).
# We should reduce the number of factors, as we have seen that with three factor we have a group of just one country.
# Mahalanobis with 2 centers:
S_x <- cov(data_num)
iS <- solve(S_x)
e <- eigen(iS)
V <- e$vectors
B <- V %*% diag(sqrt(e$values)) %*% t(V)
Xtil <- scale(data_num,scale = FALSE)
data_num.S <- Xtil %*% B
fit.mahalanobis = kmeans(data_num.S, centers=2, nstart=100)
groups = fit.mahalanobis$cluster
centers=fit.mahalanobis$centers
colnames(centers)=colnames(X)
centers
## Fertility GNI_Capita_PPP Adult_Mortality Infant_Mortality Life_Expentancy
## 1 0.0506920828 0.681762058 -0.681136027 0.212851529 -7.115530e-03
## 2 -0.0005727919 -0.007703526 0.007696452 -0.002405102 8.040147e-05
## Under5_mortality CO2 Cell_phones Inflation Business_Freedom
## 1 0.131970016 0.0494153525 0.201270630 9.1630900 -0.0149574814
## 2 -0.001491187 -0.0005583656 -0.002274244 -0.1035377 0.0001690111
## Internet_users Democracy Judical_Effectiveness Property_Rights
## 1 -0.21588603 -2.746367e-03 -0.0855124571 -0.123006834
## 2 0.00243939 3.103239e-05 0.0009662425 0.001389908
## Unemployment GDP_capita_PPP Education_Equality Economic_Freedom
## 1 0.0693218809 -0.96762859 0.122450446 0.123406376
## 2 -0.0007832981 0.01093366 -0.001383621 -0.001394422
i=1 # plotting the centers in cluster 1
bar1=barplot(centers[i,], las=2, col="darkblue", ylim=c(-2,2), main=paste("Cluster", i,": Group center in blue, global center in red"))
points(bar1,y=apply(X, 2, quantile, 0.50),col="red",pch=19)
# Democracy and judical effectiveness are relevant
i=2 # plotting the centers in cluster 2
bar2=barplot(centers[i,], las=2, col="darkblue", ylim=c(-2,2), main=paste("Cluster", i,": Group center in blue, global center in red"))
points(bar2,y=apply(X, 2, quantile, 0.50),col="red",pch=19)
# Low decmocracy score
barplot(table(groups), col="blue")
# Cluspot
fviz_cluster(fit.mahalanobis, data = X, geom = c("point"),ellipse.type = 'norm', pointsize=1)+
theme_minimal()+geom_text(label=names,hjust=0, vjust=0,size=2,check_overlap = T)+scale_fill_brewer(palette="Paired")
## Too few points to calculate an ellipse
# This classifications also makes sense, countries are divided in 2, the wealthier and with higher living standard, and the poorer.
How similar are the clusters?
adjustedRandIndex(fit$cluster, fit.mahalanobis$cluster)
## [1] -0.007383256
# The value close to 1 indicates a high correlation. As we have obtain 0.16,
# we can see that the clusters change significantly depending on the method we use
# How many groups?
fviz_nbclust(scale(X), pam, method = 'silhouette', k.max = 10)
fviz_nbclust(scale(X), pam, method = 'gap_stat', k.max = 10, nboot = 500)
fviz_nbclust(scale(X), pam, method = 'wss', k.max = 10, nboot = 500)
# Let's select 3 centers
# Visualization of clusters
fit.pam <- eclust(X, "pam", stand=TRUE, k=3, graph=F)
fviz_cluster(fit.pam, data = X, geom = c("point"), pointsize=1)+
theme_minimal()+geom_text(label=names,hjust=0, vjust=0,size=2,check_overlap = F)+scale_fill_brewer(palette="Paired")
centers2=fit.pam$medoids
barplot(centers2[1,], las=2, col="darkblue", ylim = c(-2,2))
# High fertility and mortality
barplot(centers2[2,], las=2, col="darkblue", ylim = c(-2,2))
barplot(centers2[3,], las=2, col="darkblue", ylim = c(-2,2))
# High GDP, freedom, Life expectancy (this is the group of the best countries)
adjustedRandIndex(fit$cluster, fit.pam$clustering)
## [1] 0.8150562
# Very similar the kmeans and the pam in this case.
map = data.frame(country=names, value=as.factor(fit.pam$clustering))
#map = data.frame(country=names, value=fit.kmeans$cluster)
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Micronesia
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",nameJoinColumn = "country")
## 178 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 65 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",borderCol = "#C7D9FF",catMethod = "pretty", colourPalette = "rainbow", mapTitle = c("Clusters"), lwd=1)
## using catMethod='categorical' for non numeric data in mapCountryData
# Now we can see which country belong to each group but in the map, that is more visual.
KERNEL KMEANS
fit.ker <- kkmeans(as.matrix(X), centers=3, kernel="rbfdot") # Radial Basis kernel (Gaussian)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
# By default, Gaussian kernel is used
# By default, sigma parameter is estimated
centers(fit.ker)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.8443745 1.285569 -0.8385352 -0.8482764 0.9733174 -0.7685035
## [2,] 1.0213562 -0.573755 0.9616292 1.0658178 -1.0583195 1.0516781
## [3,] -0.3573890 -0.351814 -0.3068331 -0.3953265 0.3033075 -0.4367544
## [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] 0.5595351 1.3005775 -0.1352464 1.14359718 1.4080298 0.74972753
## [2,] -0.2163829 -0.7679019 0.2273149 -0.93123076 -0.6784393 -0.49933477
## [3,] -0.1835656 -0.1847728 -0.1154241 0.07128218 -0.3396482 -0.05476961
## [,13] [,14] [,15] [,16] [,17] [,18]
## [1,] 1.2032798 1.3116217 -0.2270884 1.2122482 0.3503976 1.11800710
## [2,] -0.7728403 -0.8044215 -0.1568437 -0.5974878 -0.6144103 -0.75322875
## [3,] -0.1139886 -0.1589517 0.2978885 -0.2802020 0.3223067 -0.07381047
size(fit.ker)
## [1] 47 63 69
withinss(fit.ker)
## [1] 1434.8907 2155.9962 493.2292
object.ker = list(data = X, cluster = fit.ker@.Data)
fviz_cluster(object.ker, geom = c("point"), ellipse=F,pointsize=2)+
theme_minimal()+geom_text(label=names,hjust=0, vjust=0,size=3,check_overlap = T)+scale_fill_brewer(palette="Paired")
# We get similar clusters to the ones that we have obtained with previous methods.
HIERARCHICAL CLUSTERING
?dist #stats
## starting httpd help server ... done
?hclust
d = dist(scale(X), method='euclidean')
hc <- hclust(d, method = 'ward.D2') # Ward's minimum variance method
# Visualization with a dendrogram
# Classical dendrogram:
hc$labels <- names
fviz_dend(x = hc,
k=3,
palette = "jco",
rect = TRUE, rect_fill = TRUE,
rect_border = "jco"
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Difficult to visualize the countries
# Let's use a phylogenic tree:
fviz_dend(x = hc,
k = 3,
color_labels_by_k = TRUE,
cex = 0.8,
type = "phylogenic",
repel = TRUE)+ labs(title="Socio-economic-health tree clustering of the world") + theme(axis.text.x=element_blank(),axis.text.y=element_blank())
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Much better to vusalize it
In a map
groups.hc = cutree(hc, k = 3)
# Map our PCA index in a map:
map = data.frame(country=names, value=as.factor(groups.hc))
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Micronesia
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
nameJoinColumn = "country")
## 178 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 65 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
borderCol = "#C7D9FF",
catMethod = "pretty", colourPalette = "rainbow",
mapTitle = c("Clusters"), lwd=1)
## using catMethod='categorical' for non numeric data in mapCountryData
# Very similar to the one we obtained before,although there are slight
# differences (Uruguay, for instance, is now in the group of high-developed countries)
EM CLUSTERING
res.Mclust <- Mclust(X) # X is already scale
summary(res.Mclust)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVI (diagonal, varying volume and shape) model with 6 components:
##
## log-likelihood n df BIC ICL
## -1288.257 179 221 -3722.926 -3724.428
##
## Clustering table:
## 1 2 3 4 5 6
## 47 25 47 27 20 13
# The clustering is probabilistic: for each country we don't have a unique group but the probabilities the country belongs to each of the groups
head(res.Mclust$z)
## [,1] [,2] [,3] [,4] [,5]
## 1 1.000000e+00 0.000000e+00 1.582198e-104 0.000000e+00 1.979405e-13
## 2 1.314821e-46 2.168022e-01 7.831977e-01 7.679911e-95 2.132436e-08
## 3 1.382921e-71 7.364054e-23 9.998201e-01 0.000000e+00 1.798544e-04
## 4 1.000000e+00 0.000000e+00 4.847608e-116 0.000000e+00 2.372697e-08
## 5 2.586627e-154 1.369190e-55 3.355358e-83 0.000000e+00 9.859757e-01
## 6 2.270362e-27 1.293017e-11 9.999996e-01 3.553950e-236 4.041192e-07
## [,6]
## 1 1.645435e-147
## 2 1.105216e-10
## 3 4.999184e-11
## 4 3.321889e-136
## 5 1.402431e-02
## 6 9.150435e-21
# probabilidad de cada país de estar en cada cluster
# Of course the tool assign the group with highest probability
head(res.Mclust$classification)
## 1 2 3 4 5 6
## 1 3 3 1 5 3
# te asigna a cada cluster en función a la probabilidad
fviz_mclust(object = res.Mclust, what = "BIC", pallete = "jco")
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
# 4 groups is what by the graph we can see it is ok
Clusplot
fviz_mclust(object = res.Mclust, what = "classification", geom = "point", pallete = "jco")
# How similar are the clusters?
# Remember: The closer to 1 the more agreement
adjustedRandIndex(res.Mclust$classification, fit.pam$clustering)
## [1] 0.4832715
adjustedRandIndex(res.Mclust$classification, groups.hc)
## [1] 0.3671007
# Between 0.5 and 0.6 in both cases, well we can cocnlcude that
# they are somehow related.
# Visualization in the map
groups.mclust = res.Mclust$classification
# Map our PCA index in a map:
map = data.frame(country=names, value=groups.mclust)
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Micronesia
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
nameJoinColumn = "country")
## 178 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 65 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
borderCol = "#C7D9FF",
catMethod = "pretty", colourPalette = "topo",
mapTitle = c("Clusters"), lwd=1)
## You asked for 7 categories, 5 were used due to pretty() classification
# Now we have more groups, but we can observe that in all the clusters we have made, broadly, North America, Europe, Australia, Japan are together and are selected as the best countries to live.
Heatmaps
# A heat map is a false color image (based on data frame X) with a
# dendrogram added to the left side and to the top
heatmap(scale(X), scale = "none",
distfun = function(x){dist(x, method = "euclidean")},
hclustfun = function(x){hclust(x, method = "ward.D2")},
cexRow = 0.7)
# The darker the color, the higher the correlation
# The higher on the dendogram, the more important.
# For instance, obervation 89 is highly explained by CO2.
# Also, the observation 8 (Austria) is mainly explained by the
# 'more important' variables (the ones that explained a bigger part)
# of our dataset (cell phones, life expectancy, economic freedom, internet users...)
All of my code is inspired in what we have seen in class. That is the main reference I have used. Apart froma that, I found some interesting ways to do the graphs in: https://r-graph-gallery.com/ggplot2-package.html Also, to solve the doubts that I have while making the code I usually look for the answer in https://stackoverflow.com/ and in https://rpubs.com/